#  Copyright 2021 Collate
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#  http://www.apache.org/licenses/LICENSE-2.0
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
"""
JSON DataFrame reader
"""
import gzip
import io
import json
import zipfile
from typing import Any, Dict, List, Optional, Tuple, Union

from metadata.readers.dataframe.base import DataFrameReader
from metadata.readers.dataframe.common import dataframe_to_chunks
from metadata.readers.dataframe.models import DatalakeColumnWrapper
from metadata.utils.constants import UTF_8
from metadata.utils.logger import ingestion_logger

logger = ingestion_logger()


def _get_json_text(key: str, text: bytes, decode: bool) -> Union[str, bytes]:
    """
    Decompress ``.gz`` or ``.zip`` payloads and optionally decode bytes to a UTF-8 string.
    """
    processed_text: Union[str, bytes] = text
    if key.endswith(".gz"):
        processed_text = gzip.decompress(text)
    if key.endswith(".zip"):
        with zipfile.ZipFile(io.BytesIO(text)) as zip_file:
            # read the first file in the archive; this branch already yields str
            processed_text = zip_file.read(zip_file.infolist()[0]).decode(UTF_8)
    if decode:
        # decode only if we still have bytes, and return the processed payload
        # rather than the raw input
        return (
            processed_text.decode(UTF_8)
            if isinstance(processed_text, bytes)
            else processed_text
        )
    return processed_text
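
# A minimal usage sketch, not part of the original module: it illustrates the
# expected behaviour of ``_get_json_text`` for a gzipped key. The key name and
# payload below are hypothetical.
#
#   >>> payload = gzip.compress(b'{"name": "alice"}')
#   >>> _get_json_text(key="users.json.gz", text=payload, decode=True)
#   '{"name": "alice"}'
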


class JSONDataFrameReader(DataFrameReader):
    """
    Read JSON DFs
    """

    @staticmethod
    def read_from_json(
        key: str, json_text: bytes, decode: bool = False, **__
    ) -> Tuple[List["DataFrame"], Optional[Dict[str, Any]]]:
"""
|
|
|
|
|
Decompress a JSON file (if needed) and read its contents
|
|
|
|
|
as a dataframe.
|
|
|
|
|
|
|
|
|
|
Note that for the metadata we need to flag nested columns with a
|
|
|
|
|
custom separator. For the profiler this is not needed. We require the
|
|
|
|
|
correct column name to match with the metadata description.
|
|
|
|
|
"""
|
|
|
|
|
# pylint: disable=import-outside-toplevel
|
2023-10-12 14:51:38 +02:00
|
|
|
import pandas as pd
|
2023-08-09 12:37:16 +02:00
|
|
|
|
|
|
|
|
json_text = _get_json_text(key=key, text=json_text, decode=decode)
|
2024-03-26 10:03:21 +05:30
|
|
|
raw_data = None
|
2023-08-09 12:37:16 +02:00
|
|
|
try:
|
|
|
|
|
data = json.loads(json_text)
|
2024-03-26 10:03:21 +05:30
|
|
|
if isinstance(data, dict) and data.get("$schema"):
|
|
|
|
|
raw_data = json_text
|
2023-08-09 12:37:16 +02:00
|
|
|
except json.decoder.JSONDecodeError:
|
|
|
|
|
logger.debug("Failed to read as JSON object. Trying to read as JSON Lines")
|
|
|
|
|
data = [json.loads(json_obj) for json_obj in json_text.strip().split("\n")]
|
|
|
|
|
|
2023-10-12 14:51:38 +02:00
|
|
|
# if we get a scalar value (e.g. {"a":"b"}) then we need to specify the index
|
|
|
|
|
data = data if not isinstance(data, dict) else [data]
|
2024-03-26 10:03:21 +05:30
|
|
|
return dataframe_to_chunks(pd.DataFrame.from_records(data)), raw_data

    def _read(self, *, key: str, bucket_name: str, **kwargs) -> DatalakeColumnWrapper:
        text = self.reader.read(key, bucket_name=bucket_name)
        dataframes, raw_data = self.read_from_json(
            key=key, json_text=text, decode=True, **kwargs
        )
        return DatalakeColumnWrapper(
            dataframes=dataframes,
            raw_data=raw_data,
        )
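
# Illustrative usage sketch, not part of the original module: it shows how
# ``read_from_json`` behaves for a JSON array versus a JSON Lines payload.
# The keys and payloads are made up; ``chunks`` is a list of pandas DataFrames
# and ``raw`` is only populated when the document carries a "$schema" key.
#
#   >>> chunks, raw = JSONDataFrameReader.read_from_json(
#   ...     key="users.json",
#   ...     json_text=b'[{"id": 1, "name": "a"}, {"id": 2, "name": "b"}]',
#   ...     decode=True,
#   ... )
#
#   >>> chunks, raw = JSONDataFrameReader.read_from_json(
#   ...     key="users.jsonl",
#   ...     json_text=b'{"id": 1}\n{"id": 2}',
#   ...     decode=True,
#   ... )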