From a0195a82a5dd0dbcc6e646f408f0caad26170dc4 Mon Sep 17 00:00:00 2001
From: harshsoni2024 <64592571+harshsoni2024@users.noreply.github.com>
Date: Tue, 13 May 2025 11:23:46 +0530
Subject: [PATCH] issue-20737: support additional datalake parquet file extensions (#21048)

---
 .../src/metadata/readers/dataframe/reader_factory.py     | 8 ++++++++
 ingestion/src/metadata/utils/datalake/datalake_utils.py  | 8 +++++++-
 ingestion/tests/unit/utils/test_datalake.py              | 9 ++++++++-
 .../ui/src/generated/api/data/createContainer.ts         | 4 ++++
 .../resources/ui/src/generated/api/data/createTable.ts   | 4 ++++
 .../resources/ui/src/generated/entity/data/container.ts  | 4 ++++
 .../main/resources/ui/src/generated/entity/data/table.ts | 4 ++++
 7 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/ingestion/src/metadata/readers/dataframe/reader_factory.py b/ingestion/src/metadata/readers/dataframe/reader_factory.py
index b456ec71804..906db8d9cac 100644
--- a/ingestion/src/metadata/readers/dataframe/reader_factory.py
+++ b/ingestion/src/metadata/readers/dataframe/reader_factory.py
@@ -39,6 +39,10 @@ class SupportedTypes(Enum):
     TSV = "tsv"
     AVRO = "avro"
     PARQUET = "parquet"
+    PARQUET_PQ = "pq"
+    PARQUET_PQT = "pqt"
+    PARQUET_PARQ = "parq"
+    PARQUET_SNAPPY = "parquet.snappy"
     JSON = "json"
     JSONGZ = "json.gz"
     JSONZIP = "json.zip"
@@ -52,6 +56,10 @@ DF_READER_MAP = {
     SupportedTypes.TSV.value: TSVDataFrameReader,
     SupportedTypes.AVRO.value: AvroDataFrameReader,
     SupportedTypes.PARQUET.value: ParquetDataFrameReader,
+    SupportedTypes.PARQUET_PQ.value: ParquetDataFrameReader,
+    SupportedTypes.PARQUET_PQT.value: ParquetDataFrameReader,
+    SupportedTypes.PARQUET_PARQ.value: ParquetDataFrameReader,
+    SupportedTypes.PARQUET_SNAPPY.value: ParquetDataFrameReader,
     SupportedTypes.JSON.value: JSONDataFrameReader,
     SupportedTypes.JSONGZ.value: JSONDataFrameReader,
     SupportedTypes.JSONZIP.value: JSONDataFrameReader,
diff --git a/ingestion/src/metadata/utils/datalake/datalake_utils.py b/ingestion/src/metadata/utils/datalake/datalake_utils.py
index cfbdce06b5c..70a8cc9c6cc 100644
--- a/ingestion/src/metadata/utils/datalake/datalake_utils.py
+++ b/ingestion/src/metadata/utils/datalake/datalake_utils.py
@@ -131,7 +131,13 @@ class DataFrameColumnParser:
             shuffle: whether to shuffle the dataframe list or not if sample is True. (default: False)
         """
         data_frame = cls._get_data_frame(data_frame, sample, shuffle)
-        if file_type == SupportedTypes.PARQUET:
+        if file_type in {
+            SupportedTypes.PARQUET,
+            SupportedTypes.PARQUET_PQ,
+            SupportedTypes.PARQUET_PQT,
+            SupportedTypes.PARQUET_PARQ,
+            SupportedTypes.PARQUET_SNAPPY,
+        }:
             parser = ParquetDataFrameColumnParser(data_frame)
         elif file_type in {
             SupportedTypes.JSON,
diff --git a/ingestion/tests/unit/utils/test_datalake.py b/ingestion/tests/unit/utils/test_datalake.py
index ae676597be1..baca72b78f2 100644
--- a/ingestion/tests/unit/utils/test_datalake.py
+++ b/ingestion/tests/unit/utils/test_datalake.py
@@ -198,7 +198,14 @@ class TestParquetDataFrameColumnParser(TestCase):
         parquet_parser = DataFrameColumnParser.create(self.df, SupportedTypes.PARQUET)
         self.assertIsInstance(parquet_parser.parser, ParquetDataFrameColumnParser)
 
-        other_types = [typ for typ in SupportedTypes if typ != SupportedTypes.PARQUET]
+        parquet_types = [
+            SupportedTypes.PARQUET,
+            SupportedTypes.PARQUET_PQ,
+            SupportedTypes.PARQUET_PQT,
+            SupportedTypes.PARQUET_PARQ,
+            SupportedTypes.PARQUET_SNAPPY,
+        ]
+        other_types = [typ for typ in SupportedTypes if typ not in parquet_types]
         for other_type in other_types:
             with self.subTest(other_type=other_type):
                 generic_parser = DataFrameColumnParser.create(self.df, other_type)
diff --git a/openmetadata-ui/src/main/resources/ui/src/generated/api/data/createContainer.ts b/openmetadata-ui/src/main/resources/ui/src/generated/api/data/createContainer.ts
index 1d51262abdf..444f3672b9d 100644
--- a/openmetadata-ui/src/main/resources/ui/src/generated/api/data/createContainer.ts
+++ b/openmetadata-ui/src/main/resources/ui/src/generated/api/data/createContainer.ts
@@ -635,6 +635,10 @@ export enum FileFormat {
     Gz = "gz",
     JSON = "json",
     Parquet = "parquet",
+    ParquetPq = "pq",
+    ParquetPqt = "pqt",
+    ParquetParq = "parq",
+    ParquetSnappy = "parquet.snappy",
     Tsv = "tsv",
     Zip = "zip",
     Zstd = "zstd",
diff --git a/openmetadata-ui/src/main/resources/ui/src/generated/api/data/createTable.ts b/openmetadata-ui/src/main/resources/ui/src/generated/api/data/createTable.ts
index 7a097dd3218..52f97246c29 100644
--- a/openmetadata-ui/src/main/resources/ui/src/generated/api/data/createTable.ts
+++ b/openmetadata-ui/src/main/resources/ui/src/generated/api/data/createTable.ts
@@ -671,6 +671,10 @@ export enum FileFormat {
     JsonlGz = "jsonl.gz",
     JsonlZip = "jsonl.zip",
     Parquet = "parquet",
+    ParquetPq = "pq",
+    ParquetPqt = "pqt",
+    ParquetParq = "parq",
+    ParquetSnappy = "parquet.snappy",
     Tsv = "tsv",
 }
 
diff --git a/openmetadata-ui/src/main/resources/ui/src/generated/entity/data/container.ts b/openmetadata-ui/src/main/resources/ui/src/generated/entity/data/container.ts
index 0fb42da5faa..a8e8f7fcf6e 100644
--- a/openmetadata-ui/src/main/resources/ui/src/generated/entity/data/container.ts
+++ b/openmetadata-ui/src/main/resources/ui/src/generated/entity/data/container.ts
@@ -784,6 +784,10 @@ export enum FileFormat {
     Gz = "gz",
     JSON = "json",
     Parquet = "parquet",
+    ParquetPq = "pq",
+    ParquetPqt = "pqt",
+    ParquetParq = "parq",
+    ParquetSnappy = "parquet.snappy",
     Tsv = "tsv",
     Zip = "zip",
     Zstd = "zstd",
diff --git a/openmetadata-ui/src/main/resources/ui/src/generated/entity/data/table.ts b/openmetadata-ui/src/main/resources/ui/src/generated/entity/data/table.ts
index d4473d655b8..77d5dd8c1cf 100644
--- a/openmetadata-ui/src/main/resources/ui/src/generated/entity/data/table.ts
+++ b/openmetadata-ui/src/main/resources/ui/src/generated/entity/data/table.ts
@@ -869,6 +869,10 @@ export enum FileFormat {
     JsonlGz = "jsonl.gz",
     JsonlZip = "jsonl.zip",
     Parquet = "parquet",
+    ParquetPq = "pq",
+    ParquetPqt = "pqt",
+    ParquetParq = "parq",
+    ParquetSnappy = "parquet.snappy",
     Tsv = "tsv",
 }