From f6d5c7413f0c69a6cd77a0b3c2a993bc6ff1aca5 Mon Sep 17 00:00:00 2001 From: Mayur Singal <39544459+ulixius9@users.noreply.github.com> Date: Tue, 22 Aug 2023 13:16:22 +0530 Subject: [PATCH] Fix #6700: Add support for table properties: file format for datalake (#12920) * Fix #6700: Add support for table properties: file format for datalake & storage * pylint fix * resolve review comments --- .../ingestion/source/database/datalake/metadata.py | 8 +++++--- .../ingestion/source/storage/s3/metadata.py | 1 + .../src/metadata/utils/datalake/datalake_utils.py | 13 +++++++++++++ .../service/resources/databases/TableResource.java | 1 + .../resources/json/schema/api/data/createTable.json | 4 ++++ .../resources/json/schema/entity/data/table.json | 9 +++++++++ 6 files changed, 33 insertions(+), 3 deletions(-) diff --git a/ingestion/src/metadata/ingestion/source/database/datalake/metadata.py b/ingestion/src/metadata/ingestion/source/database/datalake/metadata.py index e60087a81a3..96ef322eb1c 100644 --- a/ingestion/src/metadata/ingestion/source/database/datalake/metadata.py +++ b/ingestion/src/metadata/ingestion/source/database/datalake/metadata.py @@ -60,7 +60,7 @@ from metadata.readers.dataframe.models import DatalakeTableSchemaWrapper from metadata.readers.dataframe.reader_factory import SupportedTypes from metadata.utils import fqn from metadata.utils.constants import COMPLEX_COLUMN_SEPARATOR, DEFAULT_DATABASE -from metadata.utils.datalake.datalake_utils import fetch_dataframe +from metadata.utils.datalake.datalake_utils import fetch_dataframe, get_file_format_type from metadata.utils.filters import filter_by_schema, filter_by_table from metadata.utils.logger import ingestion_logger from metadata.utils.s3_utils import list_s3_objects @@ -238,7 +238,7 @@ class DatalakeSource(DatabaseServiceSource): def get_tables_name_and_type( # pylint: disable=too-many-branches self, - ) -> Optional[Iterable[Tuple[str, str]]]: + ) -> Iterable[Tuple[str, TableType]]: """ Handle table and views. @@ -371,6 +371,7 @@ class DatalakeSource(DatabaseServiceSource): bucket_name=schema_name, ), ) + # If no data_frame (due to unsupported type), ignore columns = self.get_columns(data_frame[0]) if data_frame else None if columns: @@ -380,6 +381,7 @@ class DatalakeSource(DatabaseServiceSource): columns=columns, tableConstraints=table_constraints if table_constraints else None, databaseSchema=self.context.database_schema.fullyQualifiedName, + fileFormat=get_file_format_type(table_name), ) yield table_request self.register_record(table_request=table_request) @@ -552,7 +554,7 @@ class DatalakeSource(DatabaseServiceSource): complex_col_dict.clear() return cols - def yield_view_lineage(self) -> Optional[Iterable[AddLineageRequest]]: + def yield_view_lineage(self) -> Iterable[AddLineageRequest]: yield from [] def yield_tag(self, schema_name: str) -> Iterable[OMetaTagAndClassification]: diff --git a/ingestion/src/metadata/ingestion/source/storage/s3/metadata.py b/ingestion/src/metadata/ingestion/source/storage/s3/metadata.py index c086eeb1f72..7734953fe61 100644 --- a/ingestion/src/metadata/ingestion/source/storage/s3/metadata.py +++ b/ingestion/src/metadata/ingestion/source/storage/s3/metadata.py @@ -145,6 +145,7 @@ class S3Source(StorageServiceSource): service=self.context.objectstore_service.fullyQualifiedName, parent=container_details.parent, sourceUrl=container_details.sourceUrl, + fileFormats=container_details.file_formats, ) def _generate_container_details( diff --git a/ingestion/src/metadata/utils/datalake/datalake_utils.py b/ingestion/src/metadata/utils/datalake/datalake_utils.py index 5f45dd0de3c..02ecd69f05d 100644 --- a/ingestion/src/metadata/utils/datalake/datalake_utils.py +++ b/ingestion/src/metadata/utils/datalake/datalake_utils.py @@ -62,3 +62,16 @@ def fetch_dataframe( raise err return None + + +def get_file_format_type( + key: str, +) -> Optional[str]: + """ + Method to get file format type + """ + for supported_type in SupportedTypes: + if key.endswith(supported_type.value): + return supported_type.value + + return None diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/resources/databases/TableResource.java b/openmetadata-service/src/main/java/org/openmetadata/service/resources/databases/TableResource.java index 4fcec6a2547..09f7be9a45f 100644 --- a/openmetadata-service/src/main/java/org/openmetadata/service/resources/databases/TableResource.java +++ b/openmetadata-service/src/main/java/org/openmetadata/service/resources/databases/TableResource.java @@ -923,6 +923,7 @@ public class TableResource extends EntityResource