mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-10-28 17:23:28 +00:00
issue-20737: datalake parquet different extensions (#21048)
This commit is contained in:
parent
0a8421d11f
commit
a0195a82a5
@ -39,6 +39,10 @@ class SupportedTypes(Enum):
|
|||||||
TSV = "tsv"
|
TSV = "tsv"
|
||||||
AVRO = "avro"
|
AVRO = "avro"
|
||||||
PARQUET = "parquet"
|
PARQUET = "parquet"
|
||||||
|
PARQUET_PQ = "pq"
|
||||||
|
PARQUET_PQT = "pqt"
|
||||||
|
PARQUET_PARQ = "parq"
|
||||||
|
PARQUET_SNAPPY = "parquet.snappy"
|
||||||
JSON = "json"
|
JSON = "json"
|
||||||
JSONGZ = "json.gz"
|
JSONGZ = "json.gz"
|
||||||
JSONZIP = "json.zip"
|
JSONZIP = "json.zip"
|
||||||
@ -52,6 +56,10 @@ DF_READER_MAP = {
|
|||||||
SupportedTypes.TSV.value: TSVDataFrameReader,
|
SupportedTypes.TSV.value: TSVDataFrameReader,
|
||||||
SupportedTypes.AVRO.value: AvroDataFrameReader,
|
SupportedTypes.AVRO.value: AvroDataFrameReader,
|
||||||
SupportedTypes.PARQUET.value: ParquetDataFrameReader,
|
SupportedTypes.PARQUET.value: ParquetDataFrameReader,
|
||||||
|
SupportedTypes.PARQUET_PQ.value: ParquetDataFrameReader,
|
||||||
|
SupportedTypes.PARQUET_PQT.value: ParquetDataFrameReader,
|
||||||
|
SupportedTypes.PARQUET_PARQ.value: ParquetDataFrameReader,
|
||||||
|
SupportedTypes.PARQUET_SNAPPY.value: ParquetDataFrameReader,
|
||||||
SupportedTypes.JSON.value: JSONDataFrameReader,
|
SupportedTypes.JSON.value: JSONDataFrameReader,
|
||||||
SupportedTypes.JSONGZ.value: JSONDataFrameReader,
|
SupportedTypes.JSONGZ.value: JSONDataFrameReader,
|
||||||
SupportedTypes.JSONZIP.value: JSONDataFrameReader,
|
SupportedTypes.JSONZIP.value: JSONDataFrameReader,
|
||||||
|
|||||||
@ -131,7 +131,13 @@ class DataFrameColumnParser:
|
|||||||
shuffle: whether to shuffle the dataframe list or not if sample is True. (default: False)
|
shuffle: whether to shuffle the dataframe list or not if sample is True. (default: False)
|
||||||
"""
|
"""
|
||||||
data_frame = cls._get_data_frame(data_frame, sample, shuffle)
|
data_frame = cls._get_data_frame(data_frame, sample, shuffle)
|
||||||
if file_type == SupportedTypes.PARQUET:
|
if file_type in {
|
||||||
|
SupportedTypes.PARQUET,
|
||||||
|
SupportedTypes.PARQUET_PQ,
|
||||||
|
SupportedTypes.PARQUET_PQT,
|
||||||
|
SupportedTypes.PARQUET_PARQ,
|
||||||
|
SupportedTypes.PARQUET_SNAPPY,
|
||||||
|
}:
|
||||||
parser = ParquetDataFrameColumnParser(data_frame)
|
parser = ParquetDataFrameColumnParser(data_frame)
|
||||||
elif file_type in {
|
elif file_type in {
|
||||||
SupportedTypes.JSON,
|
SupportedTypes.JSON,
|
||||||
|
|||||||
@ -198,7 +198,14 @@ class TestParquetDataFrameColumnParser(TestCase):
|
|||||||
parquet_parser = DataFrameColumnParser.create(self.df, SupportedTypes.PARQUET)
|
parquet_parser = DataFrameColumnParser.create(self.df, SupportedTypes.PARQUET)
|
||||||
self.assertIsInstance(parquet_parser.parser, ParquetDataFrameColumnParser)
|
self.assertIsInstance(parquet_parser.parser, ParquetDataFrameColumnParser)
|
||||||
|
|
||||||
other_types = [typ for typ in SupportedTypes if typ != SupportedTypes.PARQUET]
|
parquet_types = [
|
||||||
|
SupportedTypes.PARQUET,
|
||||||
|
SupportedTypes.PARQUET_PQ,
|
||||||
|
SupportedTypes.PARQUET_PQT,
|
||||||
|
SupportedTypes.PARQUET_PARQ,
|
||||||
|
SupportedTypes.PARQUET_SNAPPY,
|
||||||
|
]
|
||||||
|
other_types = [typ for typ in SupportedTypes if typ not in parquet_types]
|
||||||
for other_type in other_types:
|
for other_type in other_types:
|
||||||
with self.subTest(other_type=other_type):
|
with self.subTest(other_type=other_type):
|
||||||
generic_parser = DataFrameColumnParser.create(self.df, other_type)
|
generic_parser = DataFrameColumnParser.create(self.df, other_type)
|
||||||
|
|||||||
@ -635,6 +635,10 @@ export enum FileFormat {
|
|||||||
Gz = "gz",
|
Gz = "gz",
|
||||||
JSON = "json",
|
JSON = "json",
|
||||||
Parquet = "parquet",
|
Parquet = "parquet",
|
||||||
|
ParquetPq = "pq",
|
||||||
|
ParquetPqt = "pqt",
|
||||||
|
ParquetParq = "parq",
|
||||||
|
ParquetSnappy = "parquet.snappy",
|
||||||
Tsv = "tsv",
|
Tsv = "tsv",
|
||||||
Zip = "zip",
|
Zip = "zip",
|
||||||
Zstd = "zstd",
|
Zstd = "zstd",
|
||||||
|
|||||||
@ -671,6 +671,10 @@ export enum FileFormat {
|
|||||||
JsonlGz = "jsonl.gz",
|
JsonlGz = "jsonl.gz",
|
||||||
JsonlZip = "jsonl.zip",
|
JsonlZip = "jsonl.zip",
|
||||||
Parquet = "parquet",
|
Parquet = "parquet",
|
||||||
|
ParquetPq = "pq",
|
||||||
|
ParquetPqt = "pqt",
|
||||||
|
ParquetParq = "parq",
|
||||||
|
ParquetSnappy = "parquet.snappy",
|
||||||
Tsv = "tsv",
|
Tsv = "tsv",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -784,6 +784,10 @@ export enum FileFormat {
|
|||||||
Gz = "gz",
|
Gz = "gz",
|
||||||
JSON = "json",
|
JSON = "json",
|
||||||
Parquet = "parquet",
|
Parquet = "parquet",
|
||||||
|
ParquetPq = "pq",
|
||||||
|
ParquetPqt = "pqt",
|
||||||
|
ParquetParq = "parq",
|
||||||
|
ParquetSnappy = "parquet.snappy",
|
||||||
Tsv = "tsv",
|
Tsv = "tsv",
|
||||||
Zip = "zip",
|
Zip = "zip",
|
||||||
Zstd = "zstd",
|
Zstd = "zstd",
|
||||||
|
|||||||
@ -869,6 +869,10 @@ export enum FileFormat {
|
|||||||
JsonlGz = "jsonl.gz",
|
JsonlGz = "jsonl.gz",
|
||||||
JsonlZip = "jsonl.zip",
|
JsonlZip = "jsonl.zip",
|
||||||
Parquet = "parquet",
|
Parquet = "parquet",
|
||||||
|
ParquetPq = "pq",
|
||||||
|
ParquetPqt = "pqt",
|
||||||
|
ParquetParq = "parq",
|
||||||
|
ParquetSnappy = "parquet.snappy",
|
||||||
Tsv = "tsv",
|
Tsv = "tsv",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user