issue-20737: datalake parquet different extensions (#21048)

This commit is contained in:
harshsoni2024 2025-05-13 11:23:46 +05:30 committed by harshsoni2024
parent 0a8421d11f
commit a0195a82a5
7 changed files with 39 additions and 2 deletions

View File

@@ -39,6 +39,10 @@ class SupportedTypes(Enum):
TSV = "tsv"
AVRO = "avro"
PARQUET = "parquet"
PARQUET_PQ = "pq"
PARQUET_PQT = "pqt"
PARQUET_PARQ = "parq"
PARQUET_SNAPPY = "parquet.snappy"
JSON = "json"
JSONGZ = "json.gz"
JSONZIP = "json.zip"
@@ -52,6 +56,10 @@ DF_READER_MAP = {
SupportedTypes.TSV.value: TSVDataFrameReader,
SupportedTypes.AVRO.value: AvroDataFrameReader,
SupportedTypes.PARQUET.value: ParquetDataFrameReader,
SupportedTypes.PARQUET_PQ.value: ParquetDataFrameReader,
SupportedTypes.PARQUET_PQT.value: ParquetDataFrameReader,
SupportedTypes.PARQUET_PARQ.value: ParquetDataFrameReader,
SupportedTypes.PARQUET_SNAPPY.value: ParquetDataFrameReader,
SupportedTypes.JSON.value: JSONDataFrameReader,
SupportedTypes.JSONGZ.value: JSONDataFrameReader,
SupportedTypes.JSONZIP.value: JSONDataFrameReader,

View File

@@ -131,7 +131,13 @@ class DataFrameColumnParser:
shuffle: whether to shuffle the dataframe list or not if sample is True. (default: False)
"""
data_frame = cls._get_data_frame(data_frame, sample, shuffle)
if file_type == SupportedTypes.PARQUET:
if file_type in {
SupportedTypes.PARQUET,
SupportedTypes.PARQUET_PQ,
SupportedTypes.PARQUET_PQT,
SupportedTypes.PARQUET_PARQ,
SupportedTypes.PARQUET_SNAPPY,
}:
parser = ParquetDataFrameColumnParser(data_frame)
elif file_type in {
SupportedTypes.JSON,

View File

@@ -198,7 +198,14 @@ class TestParquetDataFrameColumnParser(TestCase):
parquet_parser = DataFrameColumnParser.create(self.df, SupportedTypes.PARQUET)
self.assertIsInstance(parquet_parser.parser, ParquetDataFrameColumnParser)
other_types = [typ for typ in SupportedTypes if typ != SupportedTypes.PARQUET]
parquet_types = [
SupportedTypes.PARQUET,
SupportedTypes.PARQUET_PQ,
SupportedTypes.PARQUET_PQT,
SupportedTypes.PARQUET_PARQ,
SupportedTypes.PARQUET_SNAPPY,
]
other_types = [typ for typ in SupportedTypes if typ not in parquet_types]
for other_type in other_types:
with self.subTest(other_type=other_type):
generic_parser = DataFrameColumnParser.create(self.df, other_type)

View File

@@ -635,6 +635,10 @@ export enum FileFormat {
Gz = "gz",
JSON = "json",
Parquet = "parquet",
ParquetPq = "pq",
ParquetPqt = "pqt",
ParquetParq = "parq",
ParquetSnappy = "parquet.snappy",
Tsv = "tsv",
Zip = "zip",
Zstd = "zstd",

View File

@@ -671,6 +671,10 @@ export enum FileFormat {
JsonlGz = "jsonl.gz",
JsonlZip = "jsonl.zip",
Parquet = "parquet",
ParquetPq = "pq",
ParquetPqt = "pqt",
ParquetParq = "parq",
ParquetSnappy = "parquet.snappy",
Tsv = "tsv",
}

View File

@@ -784,6 +784,10 @@ export enum FileFormat {
Gz = "gz",
JSON = "json",
Parquet = "parquet",
ParquetPq = "pq",
ParquetPqt = "pqt",
ParquetParq = "parq",
ParquetSnappy = "parquet.snappy",
Tsv = "tsv",
Zip = "zip",
Zstd = "zstd",

View File

@@ -869,6 +869,10 @@ export enum FileFormat {
JsonlGz = "jsonl.gz",
JsonlZip = "jsonl.zip",
Parquet = "parquet",
ParquetPq = "pq",
ParquetPqt = "pqt",
ParquetParq = "parq",
ParquetSnappy = "parquet.snappy",
Tsv = "tsv",
}