mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-10-27 16:55:06 +00:00
issue-20737: datalake parquet different extensions (#21048)
This commit is contained in:
parent
0a8421d11f
commit
a0195a82a5
@ -39,6 +39,10 @@ class SupportedTypes(Enum):
|
||||
TSV = "tsv"
|
||||
AVRO = "avro"
|
||||
PARQUET = "parquet"
|
||||
PARQUET_PQ = "pq"
|
||||
PARQUET_PQT = "pqt"
|
||||
PARQUET_PARQ = "parq"
|
||||
PARQUET_SNAPPY = "parquet.snappy"
|
||||
JSON = "json"
|
||||
JSONGZ = "json.gz"
|
||||
JSONZIP = "json.zip"
|
||||
@ -52,6 +56,10 @@ DF_READER_MAP = {
|
||||
SupportedTypes.TSV.value: TSVDataFrameReader,
|
||||
SupportedTypes.AVRO.value: AvroDataFrameReader,
|
||||
SupportedTypes.PARQUET.value: ParquetDataFrameReader,
|
||||
SupportedTypes.PARQUET_PQ.value: ParquetDataFrameReader,
|
||||
SupportedTypes.PARQUET_PQT.value: ParquetDataFrameReader,
|
||||
SupportedTypes.PARQUET_PARQ.value: ParquetDataFrameReader,
|
||||
SupportedTypes.PARQUET_SNAPPY.value: ParquetDataFrameReader,
|
||||
SupportedTypes.JSON.value: JSONDataFrameReader,
|
||||
SupportedTypes.JSONGZ.value: JSONDataFrameReader,
|
||||
SupportedTypes.JSONZIP.value: JSONDataFrameReader,
|
||||
|
||||
@ -131,7 +131,13 @@ class DataFrameColumnParser:
|
||||
shuffle: whether to shuffle the dataframe list or not if sample is True. (default: False)
|
||||
"""
|
||||
data_frame = cls._get_data_frame(data_frame, sample, shuffle)
|
||||
if file_type == SupportedTypes.PARQUET:
|
||||
if file_type in {
|
||||
SupportedTypes.PARQUET,
|
||||
SupportedTypes.PARQUET_PQ,
|
||||
SupportedTypes.PARQUET_PQT,
|
||||
SupportedTypes.PARQUET_PARQ,
|
||||
SupportedTypes.PARQUET_SNAPPY,
|
||||
}:
|
||||
parser = ParquetDataFrameColumnParser(data_frame)
|
||||
elif file_type in {
|
||||
SupportedTypes.JSON,
|
||||
|
||||
@ -198,7 +198,14 @@ class TestParquetDataFrameColumnParser(TestCase):
|
||||
parquet_parser = DataFrameColumnParser.create(self.df, SupportedTypes.PARQUET)
|
||||
self.assertIsInstance(parquet_parser.parser, ParquetDataFrameColumnParser)
|
||||
|
||||
other_types = [typ for typ in SupportedTypes if typ != SupportedTypes.PARQUET]
|
||||
parquet_types = [
|
||||
SupportedTypes.PARQUET,
|
||||
SupportedTypes.PARQUET_PQ,
|
||||
SupportedTypes.PARQUET_PQT,
|
||||
SupportedTypes.PARQUET_PARQ,
|
||||
SupportedTypes.PARQUET_SNAPPY,
|
||||
]
|
||||
other_types = [typ for typ in SupportedTypes if typ not in parquet_types]
|
||||
for other_type in other_types:
|
||||
with self.subTest(other_type=other_type):
|
||||
generic_parser = DataFrameColumnParser.create(self.df, other_type)
|
||||
|
||||
@ -635,6 +635,10 @@ export enum FileFormat {
|
||||
Gz = "gz",
|
||||
JSON = "json",
|
||||
Parquet = "parquet",
|
||||
ParquetPq = "pq",
|
||||
ParquetPqt = "pqt",
|
||||
ParquetParq = "parq",
|
||||
ParquetSnappy = "parquet.snappy",
|
||||
Tsv = "tsv",
|
||||
Zip = "zip",
|
||||
Zstd = "zstd",
|
||||
|
||||
@ -671,6 +671,10 @@ export enum FileFormat {
|
||||
JsonlGz = "jsonl.gz",
|
||||
JsonlZip = "jsonl.zip",
|
||||
Parquet = "parquet",
|
||||
ParquetPq = "pq",
|
||||
ParquetPqt = "pqt",
|
||||
ParquetParq = "parq",
|
||||
ParquetSnappy = "parquet.snappy",
|
||||
Tsv = "tsv",
|
||||
}
|
||||
|
||||
|
||||
@ -784,6 +784,10 @@ export enum FileFormat {
|
||||
Gz = "gz",
|
||||
JSON = "json",
|
||||
Parquet = "parquet",
|
||||
ParquetPq = "pq",
|
||||
ParquetPqt = "pqt",
|
||||
ParquetParq = "parq",
|
||||
ParquetSnappy = "parquet.snappy",
|
||||
Tsv = "tsv",
|
||||
Zip = "zip",
|
||||
Zstd = "zstd",
|
||||
|
||||
@ -869,6 +869,10 @@ export enum FileFormat {
|
||||
JsonlGz = "jsonl.gz",
|
||||
JsonlZip = "jsonl.zip",
|
||||
Parquet = "parquet",
|
||||
ParquetPq = "pq",
|
||||
ParquetPqt = "pqt",
|
||||
ParquetParq = "parq",
|
||||
ParquetSnappy = "parquet.snappy",
|
||||
Tsv = "tsv",
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user