From a0195a82a5dd0dbcc6e646f408f0caad26170dc4 Mon Sep 17 00:00:00 2001 From: harshsoni2024 <64592571+harshsoni2024@users.noreply.github.com> Date: Tue, 13 May 2025 11:23:46 +0530 Subject: [PATCH] issue-20737: datalake parquet different extensions (#21048) --- .../src/metadata/readers/dataframe/reader_factory.py | 8 ++++++++ ingestion/src/metadata/utils/datalake/datalake_utils.py | 8 +++++++- ingestion/tests/unit/utils/test_datalake.py | 9 ++++++++- .../ui/src/generated/api/data/createContainer.ts | 4 ++++ .../resources/ui/src/generated/api/data/createTable.ts | 4 ++++ .../resources/ui/src/generated/entity/data/container.ts | 4 ++++ .../main/resources/ui/src/generated/entity/data/table.ts | 4 ++++ 7 files changed, 39 insertions(+), 2 deletions(-) diff --git a/ingestion/src/metadata/readers/dataframe/reader_factory.py b/ingestion/src/metadata/readers/dataframe/reader_factory.py index b456ec71804..906db8d9cac 100644 --- a/ingestion/src/metadata/readers/dataframe/reader_factory.py +++ b/ingestion/src/metadata/readers/dataframe/reader_factory.py @@ -39,6 +39,10 @@ class SupportedTypes(Enum): TSV = "tsv" AVRO = "avro" PARQUET = "parquet" + PARQUET_PQ = "pq" + PARQUET_PQT = "pqt" + PARQUET_PARQ = "parq" + PARQUET_SNAPPY = "parquet.snappy" JSON = "json" JSONGZ = "json.gz" JSONZIP = "json.zip" @@ -52,6 +56,10 @@ DF_READER_MAP = { SupportedTypes.TSV.value: TSVDataFrameReader, SupportedTypes.AVRO.value: AvroDataFrameReader, SupportedTypes.PARQUET.value: ParquetDataFrameReader, + SupportedTypes.PARQUET_PQ.value: ParquetDataFrameReader, + SupportedTypes.PARQUET_PQT.value: ParquetDataFrameReader, + SupportedTypes.PARQUET_PARQ.value: ParquetDataFrameReader, + SupportedTypes.PARQUET_SNAPPY.value: ParquetDataFrameReader, SupportedTypes.JSON.value: JSONDataFrameReader, SupportedTypes.JSONGZ.value: JSONDataFrameReader, SupportedTypes.JSONZIP.value: JSONDataFrameReader, diff --git a/ingestion/src/metadata/utils/datalake/datalake_utils.py b/ingestion/src/metadata/utils/datalake/datalake_utils.py index cfbdce06b5c..70a8cc9c6cc 100644 --- a/ingestion/src/metadata/utils/datalake/datalake_utils.py +++ b/ingestion/src/metadata/utils/datalake/datalake_utils.py @@ -131,7 +131,13 @@ class DataFrameColumnParser: shuffle: whether to shuffle the dataframe list or not if sample is True. (default: False) """ data_frame = cls._get_data_frame(data_frame, sample, shuffle) - if file_type == SupportedTypes.PARQUET: + if file_type in { + SupportedTypes.PARQUET, + SupportedTypes.PARQUET_PQ, + SupportedTypes.PARQUET_PQT, + SupportedTypes.PARQUET_PARQ, + SupportedTypes.PARQUET_SNAPPY, + }: parser = ParquetDataFrameColumnParser(data_frame) elif file_type in { SupportedTypes.JSON, diff --git a/ingestion/tests/unit/utils/test_datalake.py b/ingestion/tests/unit/utils/test_datalake.py index ae676597be1..baca72b78f2 100644 --- a/ingestion/tests/unit/utils/test_datalake.py +++ b/ingestion/tests/unit/utils/test_datalake.py @@ -198,7 +198,14 @@ class TestParquetDataFrameColumnParser(TestCase): parquet_parser = DataFrameColumnParser.create(self.df, SupportedTypes.PARQUET) self.assertIsInstance(parquet_parser.parser, ParquetDataFrameColumnParser) - other_types = [typ for typ in SupportedTypes if typ != SupportedTypes.PARQUET] + parquet_types = [ + SupportedTypes.PARQUET, + SupportedTypes.PARQUET_PQ, + SupportedTypes.PARQUET_PQT, + SupportedTypes.PARQUET_PARQ, + SupportedTypes.PARQUET_SNAPPY, + ] + other_types = [typ for typ in SupportedTypes if typ not in parquet_types] for other_type in other_types: with self.subTest(other_type=other_type): generic_parser = DataFrameColumnParser.create(self.df, other_type) diff --git a/openmetadata-ui/src/main/resources/ui/src/generated/api/data/createContainer.ts b/openmetadata-ui/src/main/resources/ui/src/generated/api/data/createContainer.ts index 1d51262abdf..444f3672b9d 100644 --- a/openmetadata-ui/src/main/resources/ui/src/generated/api/data/createContainer.ts +++ b/openmetadata-ui/src/main/resources/ui/src/generated/api/data/createContainer.ts @@ -635,6 +635,10 @@ export enum FileFormat { Gz = "gz", JSON = "json", Parquet = "parquet", + ParquetPq = "pq", + ParquetPqt = "pqt", + ParquetParq = "parq", + ParquetSnappy = "parquet.snappy", Tsv = "tsv", Zip = "zip", Zstd = "zstd", diff --git a/openmetadata-ui/src/main/resources/ui/src/generated/api/data/createTable.ts b/openmetadata-ui/src/main/resources/ui/src/generated/api/data/createTable.ts index 7a097dd3218..52f97246c29 100644 --- a/openmetadata-ui/src/main/resources/ui/src/generated/api/data/createTable.ts +++ b/openmetadata-ui/src/main/resources/ui/src/generated/api/data/createTable.ts @@ -671,6 +671,10 @@ export enum FileFormat { JsonlGz = "jsonl.gz", JsonlZip = "jsonl.zip", Parquet = "parquet", + ParquetPq = "pq", + ParquetPqt = "pqt", + ParquetParq = "parq", + ParquetSnappy = "parquet.snappy", Tsv = "tsv", } diff --git a/openmetadata-ui/src/main/resources/ui/src/generated/entity/data/container.ts b/openmetadata-ui/src/main/resources/ui/src/generated/entity/data/container.ts index 0fb42da5faa..a8e8f7fcf6e 100644 --- a/openmetadata-ui/src/main/resources/ui/src/generated/entity/data/container.ts +++ b/openmetadata-ui/src/main/resources/ui/src/generated/entity/data/container.ts @@ -784,6 +784,10 @@ export enum FileFormat { Gz = "gz", JSON = "json", Parquet = "parquet", + ParquetPq = "pq", + ParquetPqt = "pqt", + ParquetParq = "parq", + ParquetSnappy = "parquet.snappy", Tsv = "tsv", Zip = "zip", Zstd = "zstd", diff --git a/openmetadata-ui/src/main/resources/ui/src/generated/entity/data/table.ts b/openmetadata-ui/src/main/resources/ui/src/generated/entity/data/table.ts index d4473d655b8..77d5dd8c1cf 100644 --- a/openmetadata-ui/src/main/resources/ui/src/generated/entity/data/table.ts +++ b/openmetadata-ui/src/main/resources/ui/src/generated/entity/data/table.ts @@ -869,6 +869,10 @@ export enum FileFormat { JsonlGz = "jsonl.gz", JsonlZip = "jsonl.zip", Parquet = "parquet", + ParquetPq = "pq", + ParquetPqt = "pqt", + ParquetParq = "parq", + ParquetSnappy = "parquet.snappy", Tsv = "tsv", }