
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Module to define helper methods for datalake and to fetch data and metadata
from different auths and different file systems.
"""
import ast
import traceback
from typing import List, Optional
from metadata.generated.schema.entity.data.table import Column, DataType
from metadata.ingestion.source.database.column_helpers import truncate_column_name
from metadata.ingestion.source.database.datalake.columns import clean_dataframe
from metadata.readers.dataframe.models import (
DatalakeColumnWrapper,
DatalakeTableSchemaWrapper,
)
from metadata.readers.dataframe.reader_factory import SupportedTypes, get_df_reader
from metadata.utils.constants import COMPLEX_COLUMN_SEPARATOR
from metadata.utils.logger import utils_logger
logger = utils_logger()
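
# Mapping of primitive / pandas dtype names to OpenMetadata DataType values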
DATALAKE_DATA_TYPES = {
**dict.fromkeys(["int64", "int", "int32"], DataType.INT),
"dict": DataType.JSON,
"list": DataType.ARRAY,
**dict.fromkeys(["float64", "float32", "float"], DataType.FLOAT),
"bool": DataType.BOOLEAN,
**dict.fromkeys(
["datetime64", "timedelta[ns]", "datetime64[ns]"], DataType.DATETIME
),
"str": DataType.STRING,
}


def fetch_dataframe(
config_source,
client,
file_fqn: DatalakeTableSchemaWrapper,
**kwargs,
) -> Optional[List["DataFrame"]]:
"""
    Method to get the list of dataframes for profiling
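
    Example (illustrative sketch; the config_source / client objects are
    hypothetical and come from the connection setup):

        dataframes = fetch_dataframe(
            config_source=s3_config,
            client=s3_client,
            file_fqn=DatalakeTableSchemaWrapper(
                key="path/to/orders.csv", bucket_name="my-bucket"
            ),
        )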
"""
# dispatch to handle fetching of data from multiple file formats (csv, tsv, json, avro and parquet)
key: str = file_fqn.key
bucket_name: str = file_fqn.bucket_name
try:
        # fall back to None when the key has no supported extension
        file_extension: Optional[SupportedTypes] = file_fqn.file_extension or next(
            (
                supported_type
                for supported_type in SupportedTypes
                if key.endswith(supported_type.value)
            ),
            None,
        )
if file_extension and not key.endswith("/"):
df_reader = get_df_reader(
type_=file_extension,
config_source=config_source,
client=client,
)
try:
df_wrapper: DatalakeColumnWrapper = df_reader.read(
key=key, bucket_name=bucket_name, **kwargs
)
return df_wrapper.dataframes
except Exception as err:
logger.error(
f"Error fetching file [{bucket_name}/{key}] using "
f"[{config_source.__class__.__name__}] due to: [{err}]"
)
except Exception as err:
logger.error(
f"Error fetching file [{bucket_name}/{key}] using [{config_source.__class__.__name__}] due to: [{err}]"
)
# Here we need to blow things up. Without the dataframe we cannot move forward
raise err
return None


def get_file_format_type(key_name, metadata_entry=None):
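    """
    Return the SupportedTypes member matching the key name extension or the
    manifest metadata entry; return False when no supported format matches
    """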
for supported_types in SupportedTypes:
if key_name.endswith(supported_types.value):
return supported_types
if metadata_entry:
entry: list = [
entry for entry in metadata_entry.entries if key_name == entry.dataPath
]
if entry and supported_types.value == entry[0].structureFormat:
return supported_types
return False


def _parse_complex_column(
data_frame,
column,
final_column_list: List[Column],
complex_col_dict: dict,
processed_complex_columns: set,
) -> None:
"""
This class parses the complex columns
for example consider this data:
{
"level1": {
"level2":{
"level3": 1
}
}
}
    pandas would name this column: _##level1_##level2_##level3
    (_## being the custom separator).
    This function parses that column name and prepares a Column object like:
Column(
name="level1",
dataType="RECORD",
children=[
Column(
name="level2",
dataType="RECORD",
children=[
Column(
name="level3",
dataType="INT",
)
]
)
]
)
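
    Illustrative sketch of how the column name is split into a hierarchy
    (using the separator shown above):

        "_##level1_##level2_##level3".strip("_##").split("_##")
        # -> ["level1", "level2", "level3"]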
"""
try:
# pylint: disable=bad-str-strip-call
column_name = str(column).strip(COMPLEX_COLUMN_SEPARATOR)
col_hierarchy = tuple(column_name.split(COMPLEX_COLUMN_SEPARATOR))
parent_col: Optional[Column] = None
root_col: Optional[Column] = None
        # we only iterate col_hierarchy up to [:-1] because every
        # column/node before -1 is treated as a record, while the column
        # at -1 is the one holding a primitive datatype.
        # For example, if col_hierarchy is ("image", "properties", "size"),
        # "image" is a record whose child "properties" is also a record,
        # but "size" is not handled in this loop as it is of a primitive
        # type, e.g. int
for index, col_name in enumerate(col_hierarchy[:-1]):
if complex_col_dict.get(col_hierarchy[: index + 1]):
                # if we have already seen this column, fetch it
                parent_col = complex_col_dict.get(col_hierarchy[: index + 1])
            else:
                # if we have not seen this column, then create it and
                # append it to the parent if available
intermediate_column = Column(
name=truncate_column_name(col_name),
displayName=col_name,
dataType=DataType.RECORD,
children=[],
dataTypeDisplay=DataType.RECORD.value,
)
if parent_col:
parent_col.children.append(intermediate_column)
                    # keep the first (top-level) record as the root column
                    root_col = root_col or parent_col
parent_col = intermediate_column
complex_col_dict[col_hierarchy[: index + 1]] = parent_col
# prepare the leaf node
# use String as default type
data_type = DataType.STRING
if hasattr(data_frame[column], "dtypes"):
data_type = fetch_col_types(data_frame, column_name=column)
leaf_column = Column(
name=col_hierarchy[-1],
dataType=data_type,
dataTypeDisplay=data_type.value,
arrayDataType=DataType.UNKNOWN if data_type == DataType.ARRAY else None,
)
parent_col.children.append(leaf_column)
        # finally, add the top-level node to the column list
if col_hierarchy[0] not in processed_complex_columns:
processed_complex_columns.add(col_hierarchy[0])
final_column_list.append(root_col or parent_col)
except Exception as exc:
logger.debug(traceback.format_exc())
logger.warning(f"Unexpected exception parsing column [{column}]: {exc}")


def get_columns(data_frame: "DataFrame"):
"""
    Method to process the dataframe columns and build the Column entities
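
    Example (illustrative sketch; assumes pandas is available):

        import pandas as pd

        df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})
        columns = get_columns(df)  # e.g. an INT column and a STRING column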
"""
data_frame = clean_dataframe(data_frame)
cols = []
complex_col_dict = {}
processed_complex_columns = set()
if hasattr(data_frame, "columns"):
df_columns = list(data_frame.columns)
for column in df_columns:
if COMPLEX_COLUMN_SEPARATOR in column:
_parse_complex_column(
data_frame,
column,
cols,
complex_col_dict,
processed_complex_columns,
)
else:
# use String by default
data_type = DataType.STRING
try:
if hasattr(data_frame[column], "dtypes"):
data_type = fetch_col_types(data_frame, column_name=column)
parsed_string = {
"dataTypeDisplay": data_type.value,
"dataType": data_type,
"name": truncate_column_name(column),
"displayName": column,
}
if data_type == DataType.ARRAY:
parsed_string["arrayDataType"] = DataType.UNKNOWN
cols.append(Column(**parsed_string))
except Exception as exc:
logger.debug(traceback.format_exc())
logger.warning(
f"Unexpected exception parsing column [{column}]: {exc}"
)
complex_col_dict.clear()
return cols


def fetch_col_types(data_frame, column_name):
"""fetch_col_types: Fetch Column Type for the c
Args:
data_frame (DataFrame)
column_name (string)
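
    Example (illustrative sketch; assumes pandas is available):

        import pandas as pd

        df = pd.DataFrame({"payload": ['{"a": 1}', None]})
        fetch_col_types(df, "payload")  # -> DataType.JSON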
"""
try:
data_type = None
if data_frame[column_name].dtypes.name == "object" and any(
data_frame[column_name].dropna().values
):
try:
# Safely evaluate the input string
df_row_val = data_frame[column_name].dropna().values[0]
parsed_object = ast.literal_eval(df_row_val)
# Determine the data type of the parsed object
data_type = type(parsed_object).__name__.lower()
except (ValueError, SyntaxError):
# Handle any exceptions that may occur
data_type = "string"
data_type = DATALAKE_DATA_TYPES.get(
data_type or data_frame[column_name].dtypes.name, DataType.STRING
)
except Exception as err:
logger.warning(
f"Failed to distinguish data type for column {column_name}, Falling back to {data_type}, exc: {err}"
)
logger.debug(traceback.format_exc())
return data_type