
# Copyright 2021 Collate
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Module to define helper methods for datalake and to fetch data and metadata
from different auths and different file systems.
"""
import ast
import traceback
from typing import List, Optional
from metadata.generated.schema.entity.data.table import Column, DataType
from metadata.ingestion.source.database.column_helpers import truncate_column_name
from metadata.ingestion.source.database.datalake.columns import clean_dataframe
from metadata.readers.dataframe.models import (
DatalakeColumnWrapper,
DatalakeTableSchemaWrapper,
)
from metadata.readers.dataframe.reader_factory import SupportedTypes, get_df_reader
from metadata.utils.constants import COMPLEX_COLUMN_SEPARATOR
from metadata.utils.logger import utils_logger
logger = utils_logger()
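
# Mapping of primitive / pandas dtype names to OpenMetadata DataType values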
DATALAKE_DATA_TYPES = {
**dict.fromkeys(["int64", "int", "int32"], DataType.INT),
"dict": DataType.JSON,
"list": DataType.ARRAY,
**dict.fromkeys(["float64", "float32", "float"], DataType.FLOAT),
"bool": DataType.BOOLEAN,
**dict.fromkeys(
["datetime64", "timedelta[ns]", "datetime64[ns]"], DataType.DATETIME
),
"str": DataType.STRING,
}


def fetch_dataframe(
config_source,
client,
file_fqn: DatalakeTableSchemaWrapper,
**kwargs,
) -> Optional[List["DataFrame"]]:
"""
    Method to get the list of dataframes for profiling
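
    Example (illustrative sketch; the config_source / client objects are
    hypothetical and come from the connection setup):

        dataframes = fetch_dataframe(
            config_source=s3_config,
            client=s3_client,
            file_fqn=DatalakeTableSchemaWrapper(
                key="path/to/orders.csv", bucket_name="my-bucket"
            ),
        )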
"""
# dispatch to handle fetching of data from multiple file formats (csv, tsv, json, avro and parquet)
key: str = file_fqn.key
bucket_name: str = file_fqn.bucket_name
try:
        # fall back to None when the key has no supported extension
        file_extension: Optional[SupportedTypes] = file_fqn.file_extension or next(
            (
                supported_type
                for supported_type in SupportedTypes
                if key.endswith(supported_type.value)
            ),
            None,
        )
if file_extension and not key.endswith("/"):
df_reader = get_df_reader(
type_=file_extension,
config_source=config_source,
client=client,
)
try:
df_wrapper: DatalakeColumnWrapper = df_reader.read(
key=key, bucket_name=bucket_name, **kwargs
)
return df_wrapper.dataframes
except Exception as err:
logger.error(
f"Error fetching file [{bucket_name}/{key}] using "
f"[{config_source.__class__.__name__}] due to: [{err}]"
)
except Exception as err:
logger.error(
f"Error fetching file [{bucket_name}/{key}] using [{config_source.__class__.__name__}] due to: [{err}]"
)
# Here we need to blow things up. Without the dataframe we cannot move forward
raise err
return None


def get_file_format_type(key_name, metadata_entry=None):
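    """
    Return the SupportedTypes member matching the key name extension or the
    manifest metadata entry; return False when no supported format matches
    """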
for supported_types in SupportedTypes:
if key_name.endswith(supported_types.value):
return supported_types
if metadata_entry:
entry: list = [
entry for entry in metadata_entry.entries if key_name == entry.dataPath
]
if entry and supported_types.value == entry[0].structureFormat:
return supported_types
return False


def _parse_complex_column(
data_frame,
column,
final_column_list: List[Column],
complex_col_dict: dict,
processed_complex_columns: set,
) -> None:
"""
This class parses the complex columns
for example consider this data:
{
"level1": {
"level2":{
"level3": 1
}
}
}
    pandas would name this column: _##level1_##level2_##level3
    (_## being the custom separator).
    This function parses that column name and prepares a Column object like:
Column(
name="level1",
dataType="RECORD",
children=[
Column(
name="level2",
dataType="RECORD",
children=[
Column(
name="level3",
dataType="INT",
)
]
)
]
)
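
    Illustrative sketch of how the column name is split into a hierarchy
    (using the separator shown above):

        "_##level1_##level2_##level3".strip("_##").split("_##")
        # -> ["level1", "level2", "level3"]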
"""
try:
# pylint: disable=bad-str-strip-call
column_name = str(column).strip(COMPLEX_COLUMN_SEPARATOR)
col_hierarchy = tuple(column_name.split(COMPLEX_COLUMN_SEPARATOR))
parent_col: Optional[Column] = None
root_col: Optional[Column] = None
        # we only iterate col_hierarchy up to [:-1] because every
        # column/node before -1 is treated as a record, while the column
        # at -1 is the one holding a primitive datatype.
        # For example, if col_hierarchy is ("image", "properties", "size"),
        # "image" is a record whose child "properties" is also a record,
        # but "size" is not handled in this loop as it is of a primitive
        # type, e.g. int
for index, col_name in enumerate(col_hierarchy[:-1]):
if complex_col_dict.get(col_hierarchy[: index + 1]):
                # if we have already seen this column, fetch it
                parent_col = complex_col_dict.get(col_hierarchy[: index + 1])
            else:
                # if we have not seen this column, then create it and
                # append it to the parent if available
intermediate_column = Column(
name=truncate_column_name(col_name),
displayName=col_name,
dataType=DataType.RECORD,
children=[],
dataTypeDisplay=DataType.RECORD.value,
)
if parent_col:
parent_col.children.append(intermediate_column)
                    # keep the first (top-level) record as the root column
                    root_col = root_col or parent_col
parent_col = intermediate_column
complex_col_dict[col_hierarchy[: index + 1]] = parent_col
# prepare the leaf node
# use String as default type
data_type = DataType.STRING
if hasattr(data_frame[column], "dtypes"):
data_type = fetch_col_types(data_frame, column_name=column)
leaf_column = Column(
name=col_hierarchy[-1],
dataType=data_type,
dataTypeDisplay=data_type.value,
arrayDataType=DataType.UNKNOWN if data_type == DataType.ARRAY else None,
)
parent_col.children.append(leaf_column)
        # finally, add the top-level node to the column list
if col_hierarchy[0] not in processed_complex_columns:
processed_complex_columns.add(col_hierarchy[0])
final_column_list.append(root_col or parent_col)
except Exception as exc:
logger.debug(traceback.format_exc())
logger.warning(f"Unexpected exception parsing column [{column}]: {exc}")


def get_columns(data_frame: "DataFrame"):
"""
    Method to process the dataframe columns and build the Column entities
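
    Example (illustrative sketch; assumes pandas is available):

        import pandas as pd

        df = pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})
        columns = get_columns(df)  # e.g. an INT column and a STRING column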
"""
data_frame = clean_dataframe(data_frame)
cols = []
complex_col_dict = {}
processed_complex_columns = set()
if hasattr(data_frame, "columns"):
df_columns = list(data_frame.columns)
for column in df_columns:
if COMPLEX_COLUMN_SEPARATOR in column:
_parse_complex_column(
data_frame,
column,
cols,
complex_col_dict,
processed_complex_columns,
)
else:
# use String by default
data_type = DataType.STRING
try:
if hasattr(data_frame[column], "dtypes"):
data_type = fetch_col_types(data_frame, column_name=column)
parsed_string = {
"dataTypeDisplay": data_type.value,
"dataType": data_type,
"name": truncate_column_name(column),
"displayName": column,
}
if data_type == DataType.ARRAY:
parsed_string["arrayDataType"] = DataType.UNKNOWN
cols.append(Column(**parsed_string))
except Exception as exc:
logger.debug(traceback.format_exc())
logger.warning(
f"Unexpected exception parsing column [{column}]: {exc}"
)
complex_col_dict.clear()
return cols


def fetch_col_types(data_frame, column_name):
"""fetch_col_types: Fetch Column Type for the c
Args:
data_frame (DataFrame)
column_name (string)
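
    Example (illustrative sketch; assumes pandas is available):

        import pandas as pd

        df = pd.DataFrame({"payload": ['{"a": 1}', None]})
        fetch_col_types(df, "payload")  # -> DataType.JSON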
"""
try:
data_type = None
if data_frame[column_name].dtypes.name == "object" and any(
data_frame[column_name].dropna().values
):
try:
# Safely evaluate the input string
df_row_val = data_frame[column_name].dropna().values[0]
parsed_object = ast.literal_eval(df_row_val)
# Determine the data type of the parsed object
data_type = type(parsed_object).__name__.lower()
except (ValueError, SyntaxError):
# Handle any exceptions that may occur
data_type = "string"
data_type = DATALAKE_DATA_TYPES.get(
data_type or data_frame[column_name].dtypes.name, DataType.STRING
)
except Exception as err:
logger.warning(
f"Failed to distinguish data type for column {column_name}, Falling back to {data_type}, exc: {err}"
)
logger.debug(traceback.format_exc())
return data_type