MINOR: support JSONL datalake file types (#16614)

* fix: support JSONL datalake file types

* add jsonl zip file types

* update fileFormat enum in table schema

* add tests

* fix test data ref

* reformat

* fix tests

---------

Co-authored-by: Matthew Chamberlin <mchamberlin@ginkgobioworks.com>
Co-authored-by: Mayur Singal <39544459+ulixius9@users.noreply.github.com>
This commit is contained in:
Matt Chamberlin 2024-06-21 03:54:19 -04:00 committed by GitHub
parent 7411f9e0e1
commit ac6ddbf6c4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 44 additions and 3 deletions

View File

@@ -42,6 +42,9 @@ class SupportedTypes(Enum):
JSON = "json"
JSONGZ = "json.gz"
JSONZIP = "json.zip"
JSONL = "jsonl"
JSONLGZ = "jsonl.gz"
JSONLZIP = "jsonl.zip"
DF_READER_MAP = {
@@ -52,6 +55,9 @@ DF_READER_MAP = {
SupportedTypes.JSON.value: JSONDataFrameReader,
SupportedTypes.JSONGZ.value: JSONDataFrameReader,
SupportedTypes.JSONZIP.value: JSONDataFrameReader,
SupportedTypes.JSONL.value: JSONDataFrameReader,
SupportedTypes.JSONLGZ.value: JSONDataFrameReader,
SupportedTypes.JSONLZIP.value: JSONDataFrameReader,
}

View File

@@ -0,0 +1,2 @@
{"id": "1", "first_name": "John", "last_name": "Doe", "city": "Los Angeles", "country": "US", "birthdate": "1980-01-01", "age": "40", "json_data": {"foo": {"bar": "baz"}}}
{"id": "2", "first_name": "James", "last_name": "Doe", "city": "Los Angeles", "country": "US", "birthdate": "1980-01-01", "age": "40", "json_data": {"foo": {"bar": "baz"}}}

View File

@@ -37,9 +37,11 @@ class TestDatalake:
) # type: ignore
entities = resp.entities
assert len(entities) == 3
assert len(entities) == 4
names = [entity.name.root for entity in entities]
assert {"names.json", "new_users.parquet", "users.csv"} == set(names)
assert {"names.json", "names.jsonl", "new_users.parquet", "users.csv"} == set(
names
)
for entity in entities:
columns = entity.columns
@@ -64,10 +66,19 @@ class TestDatalake:
fqn='datalake_for_integration_tests.default.MyBucket."names.json"',
fields=["tableProfilerConfig"],
)
jsonl_ = self.metadata.get_by_name(
entity=Table,
fqn='datalake_for_integration_tests.default.MyBucket."names.jsonl"',
fields=["tableProfilerConfig"],
)
csv_sample_data = self.metadata.get_sample_data(csv_)
# parquet_sample_data = self.metadata.get_sample_data(parquet_)
json_sample_data = self.metadata.get_sample_data(json_)
jsonl_sample_data = self.metadata.get_sample_data(jsonl_)
assert csv_sample_data.sampleData.rows
# assert parquet_sample_data.sampleData.rows
assert json_sample_data.sampleData.rows
assert jsonl_sample_data.sampleData.rows

View File

@@ -104,6 +104,24 @@ class TestDataFrameReader(TestCase):
["name", "id", "version", "Company"],
)
def test_jsonl_reader(self):
    """Read the local employees.jsonl fixture and verify the resulting dataframe.

    Expects one dataframe with four rows and the four columns present in
    every record of the fixture file.
    """
    path = ROOT_PATH / "employees.jsonl"
    frames = fetch_dataframe(
        config_source=LocalConfig(),
        client=None,
        file_fqn=DatalakeTableSchemaWrapper(key=str(path), bucket_name="unused"),
    )
    # The reader must produce a non-empty list of dataframes.
    self.assertIsNotNone(frames)
    self.assertTrue(len(frames))
    first = frames[0]
    # Fixture contains 4 records with 4 fields each.
    self.assertEqual(first.shape, (4, 4))
    self.assertEqual(
        list(first.columns),
        ["name", "id", "version", "Company"],
    )
def test_avro_reader(self):
key = ROOT_PATH / "example.avro"

View File

@@ -0,0 +1,4 @@
{"name": "Name1", "id": "EMP1", "version": 1, "Company": "Collate Inc."}
{"name": "Name2", "id": "EMP2", "version": 1, "Company": "Collate Inc."}
{"name": "Name3", "id": "EMP3", "version": 1, "Company": "Collate Inc."}
{"name": "Name4", "id": "EMP4", "version": 1, "Company": "Collate Inc."}

View File

@@ -906,7 +906,7 @@
"fileFormat": {
"description": "File format in case of file/datalake tables.",
"type": "string",
"enum": ["csv", "tsv", "avro", "parquet", "json", "json.gz", "json.zip"]
"enum": ["csv", "tsv", "avro", "parquet", "json", "json.gz", "json.zip", "jsonl", "jsonl.gz", "jsonl.zip"]
}
},
"properties": {