issue-20737: support additional Parquet file extensions (pq, pqt, parq, parquet.snappy) in datalake (#21048)

This commit is contained in:
harshsoni2024 2025-05-13 11:23:46 +05:30 committed by harshsoni2024
parent 0a8421d11f
commit a0195a82a5
7 changed files with 39 additions and 2 deletions

View File

@@ -39,6 +39,10 @@ class SupportedTypes(Enum):
TSV = "tsv" TSV = "tsv"
AVRO = "avro" AVRO = "avro"
PARQUET = "parquet" PARQUET = "parquet"
PARQUET_PQ = "pq"
PARQUET_PQT = "pqt"
PARQUET_PARQ = "parq"
PARQUET_SNAPPY = "parquet.snappy"
JSON = "json" JSON = "json"
JSONGZ = "json.gz" JSONGZ = "json.gz"
JSONZIP = "json.zip" JSONZIP = "json.zip"
@@ -52,6 +56,10 @@ DF_READER_MAP = {
SupportedTypes.TSV.value: TSVDataFrameReader, SupportedTypes.TSV.value: TSVDataFrameReader,
SupportedTypes.AVRO.value: AvroDataFrameReader, SupportedTypes.AVRO.value: AvroDataFrameReader,
SupportedTypes.PARQUET.value: ParquetDataFrameReader, SupportedTypes.PARQUET.value: ParquetDataFrameReader,
SupportedTypes.PARQUET_PQ.value: ParquetDataFrameReader,
SupportedTypes.PARQUET_PQT.value: ParquetDataFrameReader,
SupportedTypes.PARQUET_PARQ.value: ParquetDataFrameReader,
SupportedTypes.PARQUET_SNAPPY.value: ParquetDataFrameReader,
SupportedTypes.JSON.value: JSONDataFrameReader, SupportedTypes.JSON.value: JSONDataFrameReader,
SupportedTypes.JSONGZ.value: JSONDataFrameReader, SupportedTypes.JSONGZ.value: JSONDataFrameReader,
SupportedTypes.JSONZIP.value: JSONDataFrameReader, SupportedTypes.JSONZIP.value: JSONDataFrameReader,

View File

@@ -131,7 +131,13 @@ class DataFrameColumnParser:
shuffle: whether to shuffle the dataframe list or not if sample is True. (default: False) shuffle: whether to shuffle the dataframe list or not if sample is True. (default: False)
""" """
data_frame = cls._get_data_frame(data_frame, sample, shuffle) data_frame = cls._get_data_frame(data_frame, sample, shuffle)
if file_type == SupportedTypes.PARQUET: if file_type in {
SupportedTypes.PARQUET,
SupportedTypes.PARQUET_PQ,
SupportedTypes.PARQUET_PQT,
SupportedTypes.PARQUET_PARQ,
SupportedTypes.PARQUET_SNAPPY,
}:
parser = ParquetDataFrameColumnParser(data_frame) parser = ParquetDataFrameColumnParser(data_frame)
elif file_type in { elif file_type in {
SupportedTypes.JSON, SupportedTypes.JSON,

View File

@@ -198,7 +198,14 @@ class TestParquetDataFrameColumnParser(TestCase):
parquet_parser = DataFrameColumnParser.create(self.df, SupportedTypes.PARQUET) parquet_parser = DataFrameColumnParser.create(self.df, SupportedTypes.PARQUET)
self.assertIsInstance(parquet_parser.parser, ParquetDataFrameColumnParser) self.assertIsInstance(parquet_parser.parser, ParquetDataFrameColumnParser)
other_types = [typ for typ in SupportedTypes if typ != SupportedTypes.PARQUET] parquet_types = [
SupportedTypes.PARQUET,
SupportedTypes.PARQUET_PQ,
SupportedTypes.PARQUET_PQT,
SupportedTypes.PARQUET_PARQ,
SupportedTypes.PARQUET_SNAPPY,
]
other_types = [typ for typ in SupportedTypes if typ not in parquet_types]
for other_type in other_types: for other_type in other_types:
with self.subTest(other_type=other_type): with self.subTest(other_type=other_type):
generic_parser = DataFrameColumnParser.create(self.df, other_type) generic_parser = DataFrameColumnParser.create(self.df, other_type)

View File

@@ -635,6 +635,10 @@ export enum FileFormat {
Gz = "gz", Gz = "gz",
JSON = "json", JSON = "json",
Parquet = "parquet", Parquet = "parquet",
ParquetPq = "pq",
ParquetPqt = "pqt",
ParquetParq = "parq",
ParquetSnappy = "parquet.snappy",
Tsv = "tsv", Tsv = "tsv",
Zip = "zip", Zip = "zip",
Zstd = "zstd", Zstd = "zstd",

View File

@@ -671,6 +671,10 @@ export enum FileFormat {
JsonlGz = "jsonl.gz", JsonlGz = "jsonl.gz",
JsonlZip = "jsonl.zip", JsonlZip = "jsonl.zip",
Parquet = "parquet", Parquet = "parquet",
ParquetPq = "pq",
ParquetPqt = "pqt",
ParquetParq = "parq",
ParquetSnappy = "parquet.snappy",
Tsv = "tsv", Tsv = "tsv",
} }

View File

@@ -784,6 +784,10 @@ export enum FileFormat {
Gz = "gz", Gz = "gz",
JSON = "json", JSON = "json",
Parquet = "parquet", Parquet = "parquet",
ParquetPq = "pq",
ParquetPqt = "pqt",
ParquetParq = "parq",
ParquetSnappy = "parquet.snappy",
Tsv = "tsv", Tsv = "tsv",
Zip = "zip", Zip = "zip",
Zstd = "zstd", Zstd = "zstd",

View File

@@ -869,6 +869,10 @@ export enum FileFormat {
JsonlGz = "jsonl.gz", JsonlGz = "jsonl.gz",
JsonlZip = "jsonl.zip", JsonlZip = "jsonl.zip",
Parquet = "parquet", Parquet = "parquet",
ParquetPq = "pq",
ParquetPqt = "pqt",
ParquetParq = "parq",
ParquetSnappy = "parquet.snappy",
Tsv = "tsv", Tsv = "tsv",
} }