diff --git a/ingestion/src/metadata/readers/dataframe/reader_factory.py b/ingestion/src/metadata/readers/dataframe/reader_factory.py index 7e698699364..112f6a727c5 100644 --- a/ingestion/src/metadata/readers/dataframe/reader_factory.py +++ b/ingestion/src/metadata/readers/dataframe/reader_factory.py @@ -42,6 +42,9 @@ class SupportedTypes(Enum): JSON = "json" JSONGZ = "json.gz" JSONZIP = "json.zip" + JSONL = "jsonl" + JSONLGZ = "jsonl.gz" + JSONLZIP = "jsonl.zip" DF_READER_MAP = { @@ -52,6 +55,9 @@ DF_READER_MAP = { SupportedTypes.JSON.value: JSONDataFrameReader, SupportedTypes.JSONGZ.value: JSONDataFrameReader, SupportedTypes.JSONZIP.value: JSONDataFrameReader, + SupportedTypes.JSONL.value: JSONDataFrameReader, + SupportedTypes.JSONLGZ.value: JSONDataFrameReader, + SupportedTypes.JSONLZIP.value: JSONDataFrameReader, } diff --git a/ingestion/tests/integration/datalake/resources/names.jsonl b/ingestion/tests/integration/datalake/resources/names.jsonl new file mode 100644 index 00000000000..fc1e3cc8a8d --- /dev/null +++ b/ingestion/tests/integration/datalake/resources/names.jsonl @@ -0,0 +1,2 @@ +{"id": "1", "first_name": "John", "last_name": "Doe", "city": "Los Angeles", "country": "US", "birthdate": "1980-01-01", "age": "40", "json_data": {"foo": {"bar": "baz"}}} +{"id": "2", "first_name": "James", "last_name": "Doe", "city": "Los Angeles", "country": "US", "birthdate": "1980-01-01", "age": "40", "json_data": {"foo": {"bar": "baz"}}} \ No newline at end of file diff --git a/ingestion/tests/integration/datalake/test_ingestion.py b/ingestion/tests/integration/datalake/test_ingestion.py index 8790d5672b4..58c1847fee0 100644 --- a/ingestion/tests/integration/datalake/test_ingestion.py +++ b/ingestion/tests/integration/datalake/test_ingestion.py @@ -37,9 +37,11 @@ class TestDatalake: ) # type: ignore entities = resp.entities - assert len(entities) == 3 + assert len(entities) == 4 names = [entity.name.root for entity in entities] - assert {"names.json", "new_users.parquet", "users.csv"} == set(names) + assert {"names.json", "names.jsonl", "new_users.parquet", "users.csv"} == set( + names + ) for entity in entities: columns = entity.columns @@ -64,10 +66,19 @@ class TestDatalake: fqn='datalake_for_integration_tests.default.MyBucket."names.json"', fields=["tableProfilerConfig"], ) + + jsonl_ = self.metadata.get_by_name( + entity=Table, + fqn='datalake_for_integration_tests.default.MyBucket."names.jsonl"', + fields=["tableProfilerConfig"], + ) + csv_sample_data = self.metadata.get_sample_data(csv_) # parquet_sample_data = self.metadata.get_sample_data(parquet_) json_sample_data = self.metadata.get_sample_data(json_) + jsonl_sample_data = self.metadata.get_sample_data(jsonl_) assert csv_sample_data.sampleData.rows # assert parquet_sample_data.sampleData.rows assert json_sample_data.sampleData.rows + assert jsonl_sample_data.sampleData.rows diff --git a/ingestion/tests/unit/readers/test_df_reader.py b/ingestion/tests/unit/readers/test_df_reader.py index e6c943856d1..653bfa4620e 100644 --- a/ingestion/tests/unit/readers/test_df_reader.py +++ b/ingestion/tests/unit/readers/test_df_reader.py @@ -104,6 +104,24 @@ class TestDataFrameReader(TestCase): ["name", "id", "version", "Company"], ) + def test_jsonl_reader(self): + key = ROOT_PATH / "employees.jsonl" + + df_list = fetch_dataframe( + config_source=LocalConfig(), + client=None, + file_fqn=DatalakeTableSchemaWrapper(key=str(key), bucket_name="unused"), + ) + + self.assertIsNotNone(df_list) + self.assertTrue(len(df_list)) + + self.assertEqual(df_list[0].shape, (4, 4)) + self.assertEqual( + list(df_list[0].columns), + ["name", "id", "version", "Company"], + ) + def test_avro_reader(self): key = ROOT_PATH / "example.avro" diff --git a/ingestion/tests/unit/resources/datalake/employees.jsonl b/ingestion/tests/unit/resources/datalake/employees.jsonl new file mode 100644 index 00000000000..c2cc238dc12 --- /dev/null +++ b/ingestion/tests/unit/resources/datalake/employees.jsonl @@ -0,0 +1,4 @@ +{"name": "Name1", "id": "EMP1", "version": 1, "Company": "Collate Inc."} +{"name": "Name2", "id": "EMP2", "version": 1, "Company": "Collate Inc."} +{"name": "Name3", "id": "EMP3", "version": 1, "Company": "Collate Inc."} +{"name": "Name4", "id": "EMP4", "version": 1, "Company": "Collate Inc."} \ No newline at end of file diff --git a/openmetadata-spec/src/main/resources/json/schema/entity/data/table.json b/openmetadata-spec/src/main/resources/json/schema/entity/data/table.json index 9189e904c9b..cf0a469c8fc 100644 --- a/openmetadata-spec/src/main/resources/json/schema/entity/data/table.json +++ b/openmetadata-spec/src/main/resources/json/schema/entity/data/table.json @@ -906,7 +906,7 @@ "fileFormat": { "description": "File format in case of file/datalake tables.", "type": "string", - "enum": ["csv", "tsv", "avro", "parquet", "json", "json.gz", "json.zip"] + "enum": ["csv", "tsv", "avro", "parquet", "json", "json.gz", "json.zip", "jsonl", "jsonl.gz", "jsonl.zip"] } }, "properties": {