Mirror of https://github.com/datahub-project/datahub.git (synced 2025-10-04 05:26:24 +00:00)
feat(ingest): Support for JSONL in s3 source with max_rows support (#9921)
Co-authored-by: Aditya <aditya.malik@quillbot.com>
Co-authored-by: Harshal Sheth <hsheth2@gmail.com>
commit 92b1cfa194
parent f399a872ad
@@ -21,13 +21,14 @@ Supported file types are as follows:

 - CSV
 - TSV
+- JSONL
 - JSON
 - Parquet
 - Apache Avro

 Schemas for Parquet and Avro files are extracted as provided.

-Schemas for schemaless formats (CSV, TSV, JSON) are inferred. For CSV and TSV files, we consider the first 100 rows by default, which can be controlled via the `max_rows` recipe parameter (see [below](#config-details))
+Schemas for schemaless formats (CSV, TSV, JSONL, JSON) are inferred. For CSV, TSV and JSONL files, we consider the first 100 rows by default, which can be controlled via the `max_rows` recipe parameter (see [below](#config-details))

 JSON file schemas are inferred on the basis of the entire file (given the difficulty in extracting only the first few objects of the file), which may impact performance.
 We are working on using iterator-based JSON parsers to avoid reading in the entire JSON object.
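For context on the documentation change above: `max_rows` can bound JSONL inference but not plain JSON inference because JSONL stores one object per line, so reading can stop after a fixed number of lines, whereas a plain JSON document must be parsed in full before any records are available. A minimal standalone sketch of the difference (plain Python, not DataHub code; the file names are hypothetical):

import itertools
import json

max_rows = 100

# JSONL: one JSON object per line, so we can stop after max_rows lines.
with open("records.jsonl", "r", encoding="utf-8") as fp:
    sample = [json.loads(line) for line in itertools.islice(fp, max_rows)]

# Plain JSON: the whole document has to be parsed before any records are usable,
# which is why schema inference reads the entire file in this case.
with open("records.json", "r", encoding="utf-8") as fp:
    records = json.load(fp)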
@@ -19,13 +19,14 @@ Supported file types are as follows:

 - CSV (*.csv)
 - TSV (*.tsv)
+- JSONL (*.jsonl)
 - JSON (*.json)
 - Parquet (*.parquet)
 - Apache Avro (*.avro)

 Schemas for Parquet and Avro files are extracted as provided.

-Schemas for schemaless formats (CSV, TSV, JSON) are inferred. For CSV and TSV files, we consider the first 100 rows by default, which can be controlled via the `max_rows` recipe parameter (see [below](#config-details))
+Schemas for schemaless formats (CSV, TSV, JSONL, JSON) are inferred. For CSV, TSV and JSONL files, we consider the first 100 rows by default, which can be controlled via the `max_rows` recipe parameter (see [below](#config-details))

 JSON file schemas are inferred on the basis of the entire file (given the difficulty in extracting only the first few objects of the file), which may impact performance.
 We are working on using iterator-based JSON parsers to avoid reading in the entire JSON object.
@@ -377,7 +377,7 @@ class S3Source(StatefulIngestionSourceBase):
                 ignoreLeadingWhiteSpace=True,
                 ignoreTrailingWhiteSpace=True,
             )
-        elif ext.endswith(".json"):
+        elif ext.endswith(".json") or ext.endswith(".jsonl"):
             df = self.spark.read.json(file)
         elif ext.endswith(".avro"):
            try:
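The Spark profiling path needs no separate reader for JSONL: by default Spark's JSON data source expects line-delimited JSON (one object per line), so routing `.jsonl` files into the same branch as `.json` is sufficient. A minimal sketch outside of the S3 source (assumes a local PySpark session and a hypothetical sample.jsonl file):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("jsonl-sketch").getOrCreate()

# Spark's JSON reader treats each input line as a JSON object by default,
# which covers both the .json and .jsonl branches above.
df = spark.read.json("sample.jsonl")
df.printSchema()
spark.stop()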
@@ -441,6 +441,10 @@ class S3Source(StatefulIngestionSourceBase):
             fields = csv_tsv.TsvInferrer(
                 max_rows=self.source_config.max_rows
             ).infer_schema(file)
+        elif extension == ".jsonl":
+            fields = json.JsonInferrer(
+                max_rows=self.source_config.max_rows, format="jsonl"
+            ).infer_schema(file)
         elif extension == ".json":
             fields = json.JsonInferrer().infer_schema(file)
         elif extension == ".avro":
@@ -1,3 +1,4 @@
+import itertools
 import logging
 from typing import IO, Dict, List, Type, Union

@@ -33,14 +34,28 @@ logger = logging.getLogger(__name__)


 class JsonInferrer(SchemaInferenceBase):
+    def __init__(self, max_rows: int = 100, format: str = "json"):
+        self.max_rows = max_rows
+        self.format = format
+
     def infer_schema(self, file: IO[bytes]) -> List[SchemaField]:
-        try:
-            datastore = ujson.load(file)
-        except ujson.JSONDecodeError as e:
-            logger.info(f"Got ValueError: {e}. Retry with jsonlines")
+        if self.format == "jsonl":
             file.seek(0)
             reader = jsl.Reader(file)
-            datastore = [obj for obj in reader.iter(type=dict, skip_invalid=True)]
+            datastore = [
+                obj
+                for obj in itertools.islice(
+                    reader.iter(type=dict, skip_invalid=True), self.max_rows
+                )
+            ]
+        else:
+            try:
+                datastore = ujson.load(file)
+            except ujson.JSONDecodeError as e:
+                logger.info(f"Got ValueError: {e}. Retry with jsonlines")
+                file.seek(0)
+                reader = jsl.Reader(file)
+                datastore = [obj for obj in reader.iter(type=dict, skip_invalid=True)]

         if not isinstance(datastore, list):
             datastore = [datastore]
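The core of the new JSONL path is bounding the read with `itertools.islice` so that at most `max_rows` objects are ever materialized, while `skip_invalid=True` tolerates malformed lines. A standalone sketch of the same pattern with the jsonlines package (imported as `jsl` in the diff; the file name here is hypothetical, not DataHub code):

import itertools
import jsonlines

max_rows = 100
with open("records.jsonl", "rb") as fp:
    reader = jsonlines.Reader(fp)
    # Stop after max_rows objects; skip lines that fail to parse.
    sample = list(itertools.islice(reader.iter(type=dict, skip_invalid=True), max_rows))
print(f"inferring schema from {len(sample)} records")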
@@ -74,6 +74,19 @@ def test_infer_schema_tsv():
     assert_field_types_match(fields, expected_field_types)


+def test_infer_schema_jsonl():
+    with tempfile.TemporaryFile(mode="w+b") as file:
+        file.write(
+            bytes(test_table.to_json(orient="records", lines=True), encoding="utf-8")
+        )
+        file.seek(0)
+
+        fields = json.JsonInferrer(max_rows=100, format="jsonl").infer_schema(file)
+
+    assert_field_paths_match(fields, expected_field_paths)
+    assert_field_types_match(fields, expected_field_types)
+
+
 def test_infer_schema_json():
     with tempfile.TemporaryFile(mode="w+b") as file:
         file.write(bytes(test_table.to_json(orient="records"), encoding="utf-8"))
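The two test fixtures differ only in layout: `orient="records"` produces a single JSON array, while adding `lines=True` emits one object per line (JSONL). A small illustration with a throwaway DataFrame (not the test's `test_table`):

import pandas as pd

df = pd.DataFrame({"name": ["a", "b"], "value": [1, 2]})

print(df.to_json(orient="records"))
# [{"name":"a","value":1},{"name":"b","value":2}]

print(df.to_json(orient="records", lines=True))
# {"name":"a","value":1}
# {"name":"b","value":2}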