MINOR: Add support for csv.gz in datalake (#22666)

* MINOR: Add support for csv.gz in datalake

* fileformat change

* Update generated TypeScript types

* pyformat
Mayur Singal authored on 2025-08-01 17:39:19 +05:30; committed by GitHub
parent 1228e9647d
commit fe28faa13f
7 changed files with 199 additions and 7 deletions
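
For context, the change teaches the datalake DSV readers to ingest gzip-compressed CSV files. A minimal, self-contained sketch (hypothetical file name) of the kind of input now supported and how pandas handles it:

```python
# Write a small gzip-compressed CSV and read it back; pandas decompresses it
# when compression="gzip" is passed (or inferred from a ".gz" path suffix).
import gzip

import pandas as pd

with gzip.open("users.csv.gz", "wt", newline="") as handle:  # hypothetical path
    handle.write("id,name\n1,Alice\n2,Bob\n")

df = pd.read_csv("users.csv.gz", compression="gzip")
print(df.shape)  # (2, 2)
```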

View File

@@ -54,16 +54,24 @@ class DSVDataFrameReader(DataFrameReader):
         super().__init__(config_source, client)

     def read_from_pandas(
-        self, path: str, storage_options: Optional[Dict[str, Any]] = None
+        self,
+        path: str,
+        storage_options: Optional[Dict[str, Any]] = None,
+        compression: Optional[str] = None,
     ) -> DatalakeColumnWrapper:
         import pandas as pd  # pylint: disable=import-outside-toplevel

+        # Determine compression based on file extension if not provided
+        if compression is None and path.endswith(".gz"):
+            compression = "gzip"
+
         chunk_list = []
         with pd.read_csv(
             path,
             sep=self.separator,
             chunksize=CHUNKSIZE,
             storage_options=storage_options,
+            compression=compression,
         ) as reader:
             for chunks in reader:
                 chunk_list.append(chunks)
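
pandas defaults to compression="infer", which detects gzip from a ".gz" suffix for path-like inputs but not for anonymous file objects (such as the S3 StreamingBody handled below), which is why the explicit argument is threaded through. A minimal sketch of the chunked pattern used here, with a hypothetical local path and chunk size:

```python
# Chunked read of a gzipped CSV, mirroring read_from_pandas above.
import pandas as pd

CHUNKSIZE = 10_000  # placeholder for the module-level constant
chunk_list = []
with pd.read_csv(
    "/tmp/users.csv.gz",  # hypothetical path
    sep=",",
    chunksize=CHUNKSIZE,
    compression="gzip",
) as reader:
    for chunk in reader:
        chunk_list.append(chunk)

print(len(chunk_list), "chunk(s) read")
```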
@@ -81,16 +89,47 @@ class DSVDataFrameReader(DataFrameReader):
         """
         Read the CSV file from the gcs bucket and return a dataframe
         """
+        # Determine compression based on file extension
+        compression = None
+        if key.endswith(".gz"):
+            compression = "gzip"
+
         path = f"gs://{bucket_name}/{key}"
-        return self.read_from_pandas(path=path)
+        return self.read_from_pandas(path=path, compression=compression)
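
pandas resolves the gs:// URL through fsspec/gcsfs; a hedged sketch of what this branch amounts to (bucket, key, and credentials hypothetical, gcsfs assumed installed):

```python
# Read a gzipped CSV straight from GCS in chunks; the gs:// URL is handled
# by gcsfs and compression="gzip" takes care of decompression.
import pandas as pd

chunks = []
with pd.read_csv(
    "gs://my-bucket/exports/users.csv.gz",  # hypothetical object
    sep=",",
    chunksize=10_000,
    compression="gzip",
) as reader:
    for chunk in reader:
        chunks.append(chunk)
```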
     @_read_dsv_dispatch.register
     def _(self, _: S3Config, key: str, bucket_name: str) -> DatalakeColumnWrapper:
-        path = self.client.get_object(Bucket=bucket_name, Key=key)["Body"]
-        return self.read_from_pandas(path=path)
+        import pandas as pd  # pylint: disable=import-outside-toplevel
+
+        # Determine compression based on file extension
+        compression = None
+        if key.endswith(".gz"):
+            compression = "gzip"
+
+        # Get the file content from S3
+        response = self.client.get_object(Bucket=bucket_name, Key=key)
+        file_content = response["Body"]
+
+        # Read the CSV data directly from the StreamingBody
+        chunk_list = []
+        with pd.read_csv(
+            file_content,
+            sep=self.separator,
+            chunksize=CHUNKSIZE,
+            compression=compression,
+        ) as reader:
+            for chunks in reader:
+                chunk_list.append(chunks)
+
+        return DatalakeColumnWrapper(dataframes=chunk_list)
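
Because the S3 object reaches pandas as a file object rather than a path, compression cannot be inferred from a file name and has to be passed explicitly. A hedged, standalone sketch of the same pattern (bucket and key hypothetical, boto3 credentials assumed configured):

```python
# Stream a gzipped CSV from S3 and parse it in chunks, as the dispatch
# method above does with the connector's own client.
import boto3
import pandas as pd

s3 = boto3.client("s3")
body = s3.get_object(Bucket="my-bucket", Key="exports/users.csv.gz")["Body"]

chunks = []
with pd.read_csv(body, sep=",", chunksize=10_000, compression="gzip") as reader:
    for chunk in reader:
        chunks.append(chunk)

print(sum(len(chunk) for chunk in chunks), "rows read")
```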
     @_read_dsv_dispatch.register
     def _(self, _: AzureConfig, key: str, bucket_name: str) -> DatalakeColumnWrapper:
+        # Determine compression based on file extension
+        compression = None
+        if key.endswith(".gz"):
+            compression = "gzip"
+
         storage_options = return_azure_storage_options(self.config_source)
         path = AZURE_PATH.format(
             bucket_name=bucket_name,
@@ -100,13 +139,19 @@ class DSVDataFrameReader(DataFrameReader):
         return self.read_from_pandas(
             path=path,
             storage_options=storage_options,
+            compression=compression,
         )
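
For the Azure branch, pandas reads the path through fsspec/adlfs with credentials passed as storage_options. A hedged sketch (container, account, and token hypothetical; the real code builds the path from AZURE_PATH and the options from the connection config):

```python
# Read a gzipped CSV from Azure storage via adlfs in chunks.
import pandas as pd

chunks = []
with pd.read_csv(
    "abfs://my-container/exports/users.csv.gz",  # hypothetical path
    sep=",",
    chunksize=10_000,
    storage_options={"account_name": "myaccount", "sas_token": "<sas-token>"},
    compression="gzip",
) as reader:
    for chunk in reader:
        chunks.append(chunk)
```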
     @_read_dsv_dispatch.register
     def _(  # pylint: disable=unused-argument
         self, _: LocalConfig, key: str, bucket_name: str
     ) -> DatalakeColumnWrapper:
-        return self.read_from_pandas(path=key)
+        # Determine compression based on file extension
+        compression = None
+        if key.endswith(".gz"):
+            compression = "gzip"
+        return self.read_from_pandas(path=key, compression=compression)

     def _read(self, *, key: str, bucket_name: str, **__) -> DatalakeColumnWrapper:
         return self._read_dsv_dispatch(

View File

@@ -36,6 +36,7 @@ logger = utils_logger()
 class SupportedTypes(Enum):
     CSV = "csv"
+    CSVGZ = "csv.gz"
     TSV = "tsv"
     AVRO = "avro"
     PARQUET = "parquet"
@@ -53,6 +54,7 @@ class SupportedTypes(Enum):
 DF_READER_MAP = {
     SupportedTypes.CSV.value: CSVDataFrameReader,
+    SupportedTypes.CSVGZ.value: CSVDataFrameReader,
     SupportedTypes.TSV.value: TSVDataFrameReader,
     SupportedTypes.AVRO.value: AvroDataFrameReader,
     SupportedTypes.PARQUET.value: ParquetDataFrameReader,
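
A quick way to see the effect of the new mapping (module path taken from the imports used in the tests below): both suffixes resolve to the same DSV-based reader class.

```python
from metadata.readers.dataframe.reader_factory import DF_READER_MAP, SupportedTypes

# "csv" and "csv.gz" share the same DataFrame reader.
assert (
    DF_READER_MAP[SupportedTypes.CSVGZ.value]
    is DF_READER_MAP[SupportedTypes.CSV.value]
)
```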
@@ -79,7 +81,10 @@ def get_df_reader(
     Load the File Reader based on the Config Source
     """
     # If we have a DSV file, build a reader dynamically based on the received separator
-    if type_ in {SupportedTypes.CSV, SupportedTypes.TSV} and separator:
+    if (
+        type_ in {SupportedTypes.CSV, SupportedTypes.CSVGZ, SupportedTypes.TSV}
+        and separator
+    ):
         return get_dsv_reader_by_separator(separator=separator)(
             config_source=config_source, client=client
         )
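
A hedged usage sketch of the factory path for gzipped CSVs, assuming the parameter names visible in this hunk (local source, no client, comma separator; the path is hypothetical):

```python
from metadata.generated.schema.entity.services.connections.database.datalakeConnection import (
    LocalConfig,
)
from metadata.readers.dataframe.reader_factory import SupportedTypes, get_df_reader

reader = get_df_reader(
    type_=SupportedTypes.CSVGZ,
    config_source=LocalConfig(),
    client=None,
    separator=",",
)
wrapper = reader.read_from_pandas(path="/tmp/users.csv.gz")
```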

View File

@@ -66,6 +66,7 @@ def fetch_dataframe(
             return df_wrapper.dataframes, df_wrapper.raw_data
         return df_wrapper.dataframes
     except Exception as err:
+        logger.debug(traceback.format_exc())
         logger.error(
             f"Error fetching file [{bucket_name}/{key}] using "
             f"[{config_source.__class__.__name__}] due to: [{err}]"

View File

@@ -23,6 +23,7 @@ from metadata.utils.datalake.datalake_utils import (
     DataFrameColumnParser,
     GenericDataFrameColumnParser,
     ParquetDataFrameColumnParser,
+    get_file_format_type,
 )

 STRUCTURE = {
@@ -451,3 +452,141 @@ class TestParquetDataFrameColumnParser(TestCase):
             with self.subTest(validation=validation):
                 expected_col, actual_col = validation
                 self._validate_parsed_column(expected_col, actual_col)
+
+    def test_get_file_format_type_csv_gz(self):
+        """test get_file_format_type function for csv.gz files"""
+        # Test csv.gz file detection
+        result = get_file_format_type("data.csv.gz")
+        self.assertEqual(result, SupportedTypes.CSVGZ)
+
+        # Test regular csv file detection (should still work)
+        result = get_file_format_type("data.csv")
+        self.assertEqual(result, SupportedTypes.CSV)
+
+        # Test other gzipped files
+        result = get_file_format_type("data.json.gz")
+        self.assertEqual(result, SupportedTypes.JSONGZ)
+
+        # Test unsupported gzipped format
+        result = get_file_format_type("data.txt.gz")
+        self.assertEqual(result, False)
+
+    def test_csv_gz_file_format_detection_edge_cases(self):
+        """test edge cases for csv.gz file format detection"""
+        # Test with nested paths
+        result = get_file_format_type("folder/subfolder/data.csv.gz")
+        self.assertEqual(result, SupportedTypes.CSVGZ)
+
+        # Test with multiple dots
+        result = get_file_format_type("data.backup.csv.gz")
+        self.assertEqual(result, SupportedTypes.CSVGZ)
+
+        # Test with no extension
+        result = get_file_format_type("data")
+        self.assertEqual(result, False)
+
+        # Test with just .gz
+        result = get_file_format_type("data.gz")
+        self.assertEqual(result, False)
+
+    def test_csv_gz_compression_detection(self):
+        """test compression detection for various file types"""
+        # Test csv.gz compression detection
+        test_cases = [
+            ("data.csv.gz", SupportedTypes.CSVGZ),
+            ("data.csv", SupportedTypes.CSV),
+            ("data.json.gz", SupportedTypes.JSONGZ),
+            ("data.json", SupportedTypes.JSON),
+            ("data.jsonl.gz", SupportedTypes.JSONLGZ),
+            ("data.jsonl", SupportedTypes.JSONL),
+            ("data.parquet", SupportedTypes.PARQUET),
+            ("data.txt.gz", False),  # Unsupported
+            ("data.unknown.gz", False),  # Unsupported
+        ]
+
+        for filename, expected in test_cases:
+            with self.subTest(filename=filename):
+                result = get_file_format_type(filename)
+                self.assertEqual(result, expected, f"Failed for {filename}")
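
For reference, a simplified suffix-matching sketch that is consistent with the expectations above; this is an illustrative stand-in, not the project's get_file_format_type implementation:

```python
from metadata.readers.dataframe.reader_factory import SupportedTypes


def guess_file_format(key: str):
    """Return the SupportedTypes member matching the file suffix, or False."""
    file_name = key.rsplit("/", maxsplit=1)[-1]
    for supported in SupportedTypes:
        if file_name.endswith(f".{supported.value}"):
            return supported
    return False


assert guess_file_format("folder/subfolder/data.csv.gz") is SupportedTypes.CSVGZ
assert guess_file_format("data.txt.gz") is False
```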
+    def test_csv_gz_reader_factory_integration(self):
+        """test that csv.gz is properly integrated with reader factory"""
+        from metadata.readers.dataframe.reader_factory import SupportedTypes
+
+        # Test that CSVGZ is properly handled
+        try:
+            # Test that the enum value exists
+            self.assertEqual(SupportedTypes.CSVGZ.value, "csv.gz")
+
+            # Test that it's different from regular CSV
+            self.assertNotEqual(SupportedTypes.CSVGZ, SupportedTypes.CSV)
+            self.assertNotEqual(SupportedTypes.CSVGZ.value, SupportedTypes.CSV.value)
+        except Exception as e:
+            self.fail(f"CSVGZ enum test failed: {e}")
+
+    def test_csv_gz_supported_types_enum(self):
+        """test that CSVGZ is properly defined in SupportedTypes enum"""
+        # Test that CSVGZ exists in the enum
+        self.assertIn(SupportedTypes.CSVGZ, SupportedTypes)
+        self.assertEqual(SupportedTypes.CSVGZ.value, "csv.gz")
+
+        # Test that it's different from regular CSV
+        self.assertNotEqual(SupportedTypes.CSVGZ, SupportedTypes.CSV)
+        self.assertNotEqual(SupportedTypes.CSVGZ.value, SupportedTypes.CSV.value)
+
+    def test_csv_gz_dsv_reader_compression_detection(self):
+        """test that DSV reader properly detects compression for csv.gz files"""
+        from metadata.generated.schema.entity.services.connections.database.datalakeConnection import (
+            LocalConfig,
+        )
+        from metadata.readers.dataframe.dsv import DSVDataFrameReader
+
+        # Create a mock config
+        local_config = LocalConfig()
+
+        # Create DSV reader
+        reader = DSVDataFrameReader(config_source=local_config, client=None)
+
+        # Test compression detection logic (this is the same logic used in the dispatch methods)
+        test_cases = [
+            ("data.csv.gz", "gzip"),
+            ("data.csv", None),
+            ("data.json.gz", "gzip"),
+            ("data.txt.gz", "gzip"),
+            ("data.unknown.gz", "gzip"),
+        ]
+
+        for filename, expected_compression in test_cases:
+            with self.subTest(filename=filename):
+                # Simulate the compression detection logic from the dispatch methods
+                compression = None
+                if filename.endswith(".gz"):
+                    compression = "gzip"
+
+                self.assertEqual(
+                    compression,
+                    expected_compression,
+                    f"Compression detection failed for {filename}",
+                )
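
Beyond the string-level check above, the same path can be exercised end-to-end against a real gzipped file; a hedged, self-contained sketch using the reader as the test constructs it (temporary file, hypothetical data, and assuming the reader's default comma separator):

```python
import gzip
import tempfile
from pathlib import Path

from metadata.generated.schema.entity.services.connections.database.datalakeConnection import (
    LocalConfig,
)
from metadata.readers.dataframe.dsv import DSVDataFrameReader

with tempfile.TemporaryDirectory() as tmp_dir:
    sample = Path(tmp_dir) / "users.csv.gz"
    with gzip.open(sample, "wt", newline="") as handle:
        handle.write("id,name\n1,Alice\n2,Bob\n")

    reader = DSVDataFrameReader(config_source=LocalConfig(), client=None)
    # compression is inferred from the ".gz" suffix when not passed explicitly
    wrapper = reader.read_from_pandas(path=str(sample))
    assert sum(len(df) for df in wrapper.dataframes) == 2
```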
+    def test_csv_gz_integration_completeness(self):
+        """test that csv.gz support is complete across all components"""
+        # Test that CSVGZ is in the reader factory mapping
+        from metadata.readers.dataframe.reader_factory import (
+            DF_READER_MAP,
+            SupportedTypes,
+        )
+
+        # Check that CSVGZ is mapped to CSVDataFrameReader
+        self.assertIn(SupportedTypes.CSVGZ.value, DF_READER_MAP)
+
+        # Test that the get_df_reader function includes CSVGZ in DSV handling
+        # This should not raise an exception for CSVGZ
+        try:
+            # Test that CSVGZ is included in the DSV types
+            dsv_types = {SupportedTypes.CSV, SupportedTypes.CSVGZ, SupportedTypes.TSV}
+            self.assertIn(SupportedTypes.CSVGZ, dsv_types)
+        except Exception as e:
+            self.fail(f"CSVGZ integration test failed: {e}")

View File

@@ -996,7 +996,7 @@
     "fileFormat": {
      "description": "File format in case of file/datalake tables.",
      "type": "string",
-     "enum": ["csv", "tsv", "avro", "parquet", "pq", "pqt", "parq", "parquet.snappy", "json", "json.gz", "json.zip", "jsonl", "jsonl.gz", "jsonl.zip"]
+     "enum": ["csv", "csv.gz", "tsv", "avro", "parquet", "pq", "pqt", "parq", "parquet.snappy", "json", "json.gz", "json.zip", "jsonl", "jsonl.gz", "jsonl.zip"]
     }
   },
   "properties": {

View File

@@ -668,6 +668,7 @@ export enum ModelType {
 export enum FileFormat {
     Avro = "avro",
     CSV = "csv",
+    CSVGz = "csv.gz",
     JSON = "json",
     JSONGz = "json.gz",
     JSONZip = "json.zip",

View File

@@ -867,6 +867,7 @@ export enum ModelType {
 export enum FileFormat {
     Avro = "avro",
     CSV = "csv",
+    CSVGz = "csv.gz",
    JSON = "json",
    JSONGz = "json.gz",
    JSONZip = "json.zip",