Mirror of https://github.com/open-metadata/OpenMetadata.git, synced 2025-11-01 19:18:05 +00:00
MINOR: support JSONL datalake file types (#16614)
* fix: support JSONL datalake file types
* add jsonl zip file types
* update fileFormat enum in table schema
* add tests
* fix test data ref
* reformat
* fix tests

Co-authored-by: Matthew Chamberlin <mchamberlin@ginkgobioworks.com>
Co-authored-by: Mayur Singal <39544459+ulixius9@users.noreply.github.com>
This commit is contained in:
parent
7411f9e0e1
commit
ac6ddbf6c4
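For context: a JSONL (JSON Lines) file holds one JSON object per line, so the file as a whole is not a valid JSON document and cannot be handed to a plain JSON parser. A minimal standard-library sketch of the difference, independent of the reader classes touched in this commit:

    import json

    jsonl_payload = '{"id": "1"}\n{"id": "2"}'

    # Parsing the whole payload as one JSON document fails with "Extra data",
    # because a second top-level value follows the first object.
    try:
        json.loads(jsonl_payload)
    except json.JSONDecodeError as exc:
        print(f"plain JSON parse fails: {exc}")

    # JSONL is instead parsed one line (one record) at a time.
    records = [json.loads(line) for line in jsonl_payload.splitlines()]
    print(records)  # [{'id': '1'}, {'id': '2'}]

This is why the change below registers "jsonl" (and its compressed variants) as distinct file types rather than reusing the plain "json" entries.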
@@ -42,6 +42,9 @@ class SupportedTypes(Enum):
     JSON = "json"
     JSONGZ = "json.gz"
     JSONZIP = "json.zip"
+    JSONL = "jsonl"
+    JSONLGZ = "jsonl.gz"
+    JSONLZIP = "jsonl.zip"


 DF_READER_MAP = {
@@ -52,6 +55,9 @@ DF_READER_MAP = {
     SupportedTypes.JSON.value: JSONDataFrameReader,
     SupportedTypes.JSONGZ.value: JSONDataFrameReader,
     SupportedTypes.JSONZIP.value: JSONDataFrameReader,
+    SupportedTypes.JSONL.value: JSONDataFrameReader,
+    SupportedTypes.JSONLGZ.value: JSONDataFrameReader,
+    SupportedTypes.JSONLZIP.value: JSONDataFrameReader,
 }
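The enum values and the DF_READER_MAP entries work as a pair: a file's extension string selects the reader class. Below is a self-contained sketch of that suffix-to-reader dispatch pattern; read_json, read_jsonl, and pick_reader are hypothetical stand-ins for illustration, not OpenMetadata's actual factory API:

    from enum import Enum
    from typing import Callable, List


    class SupportedTypes(Enum):
        JSON = "json"
        JSONL = "jsonl"
        JSONLGZ = "jsonl.gz"


    def read_json(path: str) -> List[dict]:  # hypothetical reader
        raise NotImplementedError


    def read_jsonl(path: str) -> List[dict]:  # hypothetical reader
        raise NotImplementedError


    READER_MAP: dict = {
        SupportedTypes.JSON.value: read_json,
        SupportedTypes.JSONL.value: read_jsonl,
        SupportedTypes.JSONLGZ.value: read_jsonl,
    }


    def pick_reader(filename: str) -> Callable:
        # Check the longest suffixes first so "data.jsonl.gz" matches the
        # "jsonl.gz" entry rather than stopping at plain "jsonl".
        for suffix in sorted(READER_MAP, key=len, reverse=True):
            if filename.endswith("." + suffix):
                return READER_MAP[suffix]
        raise ValueError(f"unsupported file type: {filename}")


    print(pick_reader("data.jsonl.gz").__name__)  # read_jsonl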
(new file: the names.jsonl fixture referenced by the integration test below)
@@ -0,0 +1,2 @@
+{"id": "1", "first_name": "John", "last_name": "Doe", "city": "Los Angeles", "country": "US", "birthdate": "1980-01-01", "age": "40", "json_data": {"foo": {"bar": "baz"}}}
+{"id": "2", "first_name": "James", "last_name": "Doe", "city": "Los Angeles", "country": "US", "birthdate": "1980-01-01", "age": "40", "json_data": {"foo": {"bar": "baz"}}}
@@ -37,9 +37,11 @@ class TestDatalake:
         )  # type: ignore

         entities = resp.entities
-        assert len(entities) == 3
+        assert len(entities) == 4
         names = [entity.name.root for entity in entities]
-        assert {"names.json", "new_users.parquet", "users.csv"} == set(names)
+        assert {"names.json", "names.jsonl", "new_users.parquet", "users.csv"} == set(
+            names
+        )

         for entity in entities:
             columns = entity.columns
@@ -64,10 +66,19 @@ class TestDatalake:
             fqn='datalake_for_integration_tests.default.MyBucket."names.json"',
             fields=["tableProfilerConfig"],
         )

+        jsonl_ = self.metadata.get_by_name(
+            entity=Table,
+            fqn='datalake_for_integration_tests.default.MyBucket."names.jsonl"',
+            fields=["tableProfilerConfig"],
+        )
+
         csv_sample_data = self.metadata.get_sample_data(csv_)
         # parquet_sample_data = self.metadata.get_sample_data(parquet_)
         json_sample_data = self.metadata.get_sample_data(json_)
+        jsonl_sample_data = self.metadata.get_sample_data(jsonl_)

         assert csv_sample_data.sampleData.rows
         # assert parquet_sample_data.sampleData.rows
         assert json_sample_data.sampleData.rows
+        assert jsonl_sample_data.sampleData.rows
@@ -104,6 +104,24 @@ class TestDataFrameReader(TestCase):
             ["name", "id", "version", "Company"],
         )

+    def test_jsonl_reader(self):
+        key = ROOT_PATH / "employees.jsonl"
+
+        df_list = fetch_dataframe(
+            config_source=LocalConfig(),
+            client=None,
+            file_fqn=DatalakeTableSchemaWrapper(key=str(key), bucket_name="unused"),
+        )
+
+        self.assertIsNotNone(df_list)
+        self.assertTrue(len(df_list))
+
+        self.assertEqual(df_list[0].shape, (4, 4))
+        self.assertEqual(
+            list(df_list[0].columns),
+            ["name", "id", "version", "Company"],
+        )
+
     def test_avro_reader(self):
         key = ROOT_PATH / "example.avro"
ingestion/tests/unit/resources/datalake/employees.jsonl (new file, 4 lines)
@@ -0,0 +1,4 @@
+{"name": "Name1", "id": "EMP1", "version": 1, "Company": "Collate Inc."}
+{"name": "Name2", "id": "EMP2", "version": 1, "Company": "Collate Inc."}
+{"name": "Name3", "id": "EMP3", "version": 1, "Company": "Collate Inc."}
+{"name": "Name4", "id": "EMP4", "version": 1, "Company": "Collate Inc."}
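Given these four fixture rows, the (4, 4) shape the unit test asserts can be sanity-checked directly with pandas. A small sketch, assuming pandas is available and the path is resolved from the repo root:

    import pandas as pd

    # Load the fixture as line-delimited JSON and confirm the shape and
    # column order the unit test above asserts.
    df = pd.read_json(
        "ingestion/tests/unit/resources/datalake/employees.jsonl", lines=True
    )
    assert df.shape == (4, 4)
    assert list(df.columns) == ["name", "id", "version", "Company"]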
@@ -906,7 +906,7 @@
     "fileFormat": {
       "description": "File format in case of file/datalake tables.",
       "type": "string",
-      "enum": ["csv", "tsv", "avro", "parquet", "json", "json.gz", "json.zip"]
+      "enum": ["csv", "tsv", "avro", "parquet", "json", "json.gz", "json.zip", "jsonl", "jsonl.gz", "jsonl.zip"]
     }
   },
   "properties": {
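With the widened enum, any of the three new strings passes wherever fileFormat is validated against this schema. A quick sketch using the jsonschema package against the enum fragment above:

    from jsonschema import ValidationError, validate

    # Schema fragment copied from the hunk above.
    file_format_schema = {
        "type": "string",
        "enum": [
            "csv", "tsv", "avro", "parquet",
            "json", "json.gz", "json.zip",
            "jsonl", "jsonl.gz", "jsonl.zip",
        ],
    }

    validate("jsonl.gz", file_format_schema)  # accepted after this change

    try:
        validate("jsonl.bz2", file_format_schema)  # not in the enum
    except ValidationError as exc:
        print(exc.message)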