Fix #14895 - Get Metadata from Parquet Schema (#14956)

* linting: fix python linting

* fix: get column types from parquet schema for parquet files

* style: python linting

* fix: remove displayType check in test as it varies depending on OS
Authored by Teddy on 2024-02-01 09:02:52 +01:00, committed by GitHub
parent 6838fadec6
commit 9a4a9df836
62 changed files with 686 additions and 218 deletions
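In short, the change stops inferring parquet column types from pandas dtypes and instead reads them from the Arrow schema backing the parquet data. A minimal sketch of that idea, assuming pandas and pyarrow are installed and the file name is hypothetical:

import pandas as pd
import pyarrow as pa

df = pd.read_parquet("example.parquet")         # hypothetical parquet file
arrow_schema = pa.Table.from_pandas(df).schema  # recover the Arrow schema of the dataframe
for field in arrow_schema:
    print(field.name, field.type)               # each column name with its Arrow type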


@ -54,7 +54,6 @@ class TestCaseRunner(Processor):
"""Execute the test suite tests and create test cases from the YAML config"""
def __init__(self, config: OpenMetadataWorkflowConfig, metadata: OpenMetadata):
super().__init__()
self.config = config


@ -149,7 +149,6 @@ class TestSuiteSource(Source):
)
else:
test_suite_cases = self._get_test_cases_from_test_suite(table.testSuite)
yield Either(


@ -17,7 +17,7 @@ from typing import Optional
from metadata.profiler.metrics.core import add_props
from metadata.profiler.metrics.registry import Metrics
from metadata.utils.datalake.datalake_utils import fetch_col_types
from metadata.utils.datalake.datalake_utils import GenericDataFrameColumnParser
from metadata.utils.entity_link import get_decoded_column
from metadata.utils.sqa_like_column import SQALikeColumn
@ -28,7 +28,9 @@ class PandasValidatorMixin:
def get_column_name(self, entity_link: str, dfs) -> SQALikeColumn:
# we'll use the first dataframe chunk to get the column name.
column = dfs[0][get_decoded_column(entity_link)]
_type = fetch_col_types(dfs[0], get_decoded_column(entity_link))
_type = GenericDataFrameColumnParser.fetch_col_types(
dfs[0], get_decoded_column(entity_link)
)
sqa_like_column = SQALikeColumn(
name=column.name,
type=_type,


@ -389,7 +389,6 @@ class LineageParser:
def _evaluate_best_parser(
query: str, dialect: Dialect, timeout_seconds: int
) -> Optional[LineageRunner]:
if query is None:
return None


@ -210,9 +210,9 @@ def _determine_restricted_operation(
Only retain add operation for restrict_update_fields fields
"""
path = patch_ops.get("path")
op = patch_ops.get("op")
ops = patch_ops.get("op")
for field in restrict_update_fields or []:
if field in path and op != PatchOperation.ADD.value:
if field in path and ops != PatchOperation.ADD.value:
return False
return True
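A hedged illustration of the rule described in the docstring, assuming `patch_ops` is a plain dict with `path` and `op` keys (the full signature is not shown in this hunk):

# A non-"add" operation on a restricted field is rejected...
_determine_restricted_operation(
    patch_ops={"path": "/description", "op": "replace"},
    restrict_update_fields=["description"],
)  # -> False

# ...while "add" operations (or unrestricted fields) pass through.
_determine_restricted_operation(
    patch_ops={"path": "/description", "op": "add"},
    restrict_update_fields=["description"],
)  # -> True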


@ -135,7 +135,6 @@ class OMetaPatchMixin(OMetaPatchMixinBase):
Updated Entity
"""
try:
patch = build_patch(
source=source,
destination=destination,


@ -531,7 +531,6 @@ class DashboardServiceSource(TopologyRunnerMixin, Source, ABC):
)
def check_database_schema_name(self, database_schema_name: str):
"""
Check if the input database schema name is equal to "<default>" and return the input name if it is not.


@ -833,7 +833,6 @@ class LookerSource(DashboardServiceSource):
to_entity.id.__root__
not in self._added_lineage[from_entity.id.__root__]
):
self._added_lineage[from_entity.id.__root__].append(
to_entity.id.__root__
)
@ -943,7 +942,6 @@ class LookerSource(DashboardServiceSource):
dashboard_name = self.context.dashboard
try:
dashboard_fqn = fqn.build(
metadata=self.metadata,
entity_type=Dashboard,


@ -192,7 +192,6 @@ class SupersetAPISource(SupersetSourceMixin):
def yield_datamodel(
self, dashboard_details: DashboardResult
) -> Iterable[Either[CreateDashboardDataModelRequest]]:
if self.source_config.includeDataModels:
for chart_id in self._get_charts_of_dashboard(dashboard_details):
try:


@ -216,7 +216,6 @@ class SupersetDBSource(SupersetSourceMixin):
def yield_datamodel(
self, dashboard_details: FetchDashboard
) -> Iterable[Either[CreateDashboardDataModelRequest]]:
if self.source_config.includeDataModels:
for chart_id in self._get_charts_of_dashboard(dashboard_details):
chart_json = self.all_charts.get(chart_id)


@ -77,7 +77,6 @@ class AzuresqlSource(CommonDbSourceService, MultiDBSource):
yield from self._execute_database_query(AZURE_SQL_GET_DATABASES)
def get_database_names(self) -> Iterable[str]:
if not self.config.serviceConnection.__root__.config.ingestAllDatabases:
configured_db = self.config.serviceConnection.__root__.config.database
self.set_inspector(database_name=configured_db)


@ -46,7 +46,7 @@ from metadata.ingestion.source.database.database_service import DatabaseServiceS
from metadata.ingestion.source.database.stored_procedures_mixin import QueryByProcedure
from metadata.utils import fqn
from metadata.utils.constants import DEFAULT_DATABASE
from metadata.utils.datalake.datalake_utils import get_columns
from metadata.utils.datalake.datalake_utils import DataFrameColumnParser
from metadata.utils.filters import filter_by_schema, filter_by_table
from metadata.utils.logger import ingestion_logger
@ -217,7 +217,8 @@ class CommonNoSQLSource(DatabaseServiceSource, ABC):
try:
data = self.get_table_columns_dict(schema_name, table_name)
df = pd.DataFrame.from_records(list(data))
columns = get_columns(df)
column_parser = DataFrameColumnParser.create(df)
columns = column_parser.get_columns()
table_request = CreateTableRequest(
name=table_name,
tableType=table_type,


@ -94,7 +94,6 @@ class DatabricksClient:
Method returns List the history of queries through SQL warehouses
"""
try:
data = {}
daydiff = end_date - start_date


@ -77,8 +77,8 @@ from metadata.utils import fqn
from metadata.utils.constants import DEFAULT_DATABASE
from metadata.utils.credentials import GOOGLE_CREDENTIALS
from metadata.utils.datalake.datalake_utils import (
DataFrameColumnParser,
fetch_dataframe,
get_columns,
get_file_format_type,
)
from metadata.utils.filters import filter_by_database, filter_by_schema, filter_by_table
@ -416,9 +416,14 @@ class DatalakeSource(DatabaseServiceSource):
file_extension=table_extension,
),
)
# If no data_frame (due to unsupported type), ignore
columns = get_columns(data_frame[0]) if data_frame else None
if data_frame:
column_parser = DataFrameColumnParser.create(
data_frame[0], table_extension
)
columns = column_parser.get_columns()
else:
# If no data_frame (due to unsupported type), ignore
columns = None
if columns:
table_request = CreateTableRequest(
name=table_name,


@ -692,7 +692,6 @@ class DbtSource(DbtServiceSource):
)
if table_entity:
try:
service_name, database_name, schema_name, table_name = fqn.split(
table_entity.fullyQualifiedName.__root__
)


@ -126,7 +126,6 @@ def get_metastore_connection(connection: Any) -> Engine:
@get_metastore_connection.register
def _(connection: PostgresConnection):
# import required to load sqlalchemy plugin
# pylint: disable=import-outside-toplevel,unused-import
from metadata.ingestion.source.database.hive.metastore_dialects.postgres import ( # nopycln: import
@ -153,7 +152,6 @@ def _(connection: PostgresConnection):
@get_metastore_connection.register
def _(connection: MysqlConnection):
# import required to load sqlalchemy plugin
# pylint: disable=import-outside-toplevel,unused-import
from metadata.ingestion.source.database.hive.metastore_dialects.mysql import ( # nopycln: import


@ -114,7 +114,6 @@ class MssqlSource(StoredProcedureMixin, CommonDbSourceService, MultiDBSource):
yield from self._execute_database_query(MSSQL_GET_DATABASE)
def get_database_names(self) -> Iterable[str]:
if not self.config.serviceConnection.__root__.config.ingestAllDatabases:
configured_db = self.config.serviceConnection.__root__.config.database
self.set_inspector(database_name=configured_db)


@ -62,7 +62,6 @@ def get_view_definition(
dblink="",
**kw,
):
return get_view_definition_wrapper(
self,
connection,


@ -75,7 +75,6 @@ def get_lineage_from_multi_tenant_table(
connection: any,
service_name: str,
) -> Iterator[Either[AddLineageRequest]]:
"""
For PGSpider, first get the list of multi-tenant tables.
Next, get the child foreign tables of each multi-tenant table.


@ -800,7 +800,6 @@ class SampleDataSource(
# Create table and stored procedure lineage
for lineage_entities in self.stored_procedures["lineage"]:
from_table = self.metadata.get_by_name(
entity=Table, fqn=lineage_entities["from_table_fqn"]
)


@ -465,7 +465,6 @@ class SasSource(
or table_entity.extension.__root__.get("analysisTimeStamp")
!= table_extension.get("analysisTimeStamp")
):
# create the columns of the table
columns, col_profile_list = self.create_columns_and_profiles(
col_entity_instances, table_entity_instance
@ -711,10 +710,10 @@ class SasSource(
if "state" in table_resource and table_resource["state"] == "unloaded":
self.sas_client.load_table(table_uri + "/state?value=loaded")
except HTTPError as e:
except HTTPError as exc:
# append http error to table description if it can't be found
logger.error(f"table_uri: {table_uri}")
self.report_description.append(str(e))
self.report_description.append(str(exc))
name_index = table_uri.rindex("/")
table_name = table_uri[name_index + 1 :]
param = f"filter=eq(name,'{table_name}')"


@ -39,7 +39,6 @@ from metadata.utils.sqlalchemy_utils import (
def _quoted_name(entity_name: Optional[str]) -> Optional[str]:
if entity_name:
return fqn.quote_name(entity_name)


@ -153,7 +153,6 @@ class StoredProcedureMixin(ABC):
query_type=query_by_procedure.query_type,
query_text=query_by_procedure.query_text,
):
self.context.stored_procedure_query_lineage = True
for either_lineage in get_lineage_by_query(
self.metadata,


@ -246,7 +246,6 @@ class CommonBrokerSource(MessagingServiceSource, ABC):
if messages:
for message in messages:
try:
value = message.value()
sample_data.append(
self.decode_message(


@ -131,7 +131,6 @@ class SplineSource(PipelineServiceSource):
return None
def _get_table_from_datasource_name(self, datasource: str) -> Optional[Table]:
if (
not datasource
and not datasource.startswith("dbfs")


@ -111,7 +111,6 @@ class ElasticsearchSource(SearchServiceSource):
Method to Get Sample Data of Search Index Entity
"""
if self.source_config.includeSampleData and self.context.search_index:
sample_data = self.client.search(
index=self.context.search_index,
q=WILDCARD_SEARCH,


@ -14,8 +14,6 @@ Base class for ingesting Object Storage services
from abc import ABC, abstractmethod
from typing import Any, Iterable, List, Optional, Set
from pandas import DataFrame
from metadata.generated.schema.api.data.createContainer import CreateContainerRequest
from metadata.generated.schema.entity.data.container import Container
from metadata.generated.schema.entity.services.storageService import (
@ -53,7 +51,10 @@ from metadata.readers.dataframe.models import DatalakeTableSchemaWrapper
from metadata.readers.dataframe.reader_factory import SupportedTypes
from metadata.readers.models import ConfigSource
from metadata.utils import fqn
from metadata.utils.datalake.datalake_utils import fetch_dataframe, get_columns
from metadata.utils.datalake.datalake_utils import (
DataFrameColumnParser,
fetch_dataframe,
)
from metadata.utils.logger import ingestion_logger
from metadata.utils.storage_metadata_config import (
StorageMetadataConfigException,
@ -67,7 +68,6 @@ OPENMETADATA_TEMPLATE_FILE_NAME = "openmetadata.json"
class StorageServiceTopology(ServiceTopology):
root = TopologyNode(
producer="get_services",
stages=[
@ -271,10 +271,10 @@ class StorageServiceSource(TopologyRunnerMixin, Source, ABC):
),
)
columns = []
if isinstance(data_structure_details, DataFrame):
columns = get_columns(data_structure_details)
if isinstance(data_structure_details, list) and data_structure_details:
columns = get_columns(data_structure_details[0])
column_parser = DataFrameColumnParser.create(
data_structure_details, SupportedTypes(metadata_entry.structureFormat)
)
columns = column_parser.get_columns()
return columns
def _get_columns(


@ -34,7 +34,10 @@ from metadata.profiler.metrics.core import MetricTypes
from metadata.profiler.metrics.registry import Metrics
from metadata.readers.dataframe.models import DatalakeTableSchemaWrapper
from metadata.utils.constants import COMPLEX_COLUMN_SEPARATOR, SAMPLE_DATA_DEFAULT_COUNT
from metadata.utils.datalake.datalake_utils import fetch_col_types, fetch_dataframe
from metadata.utils.datalake.datalake_utils import (
GenericDataFrameColumnParser,
fetch_dataframe,
)
from metadata.utils.logger import profiler_interface_registry_logger
from metadata.utils.sqa_like_column import SQALikeColumn
@ -411,7 +414,9 @@ class PandasProfilerInterface(ProfilerInterface, PandasInterfaceMixin):
sqalike_columns.append(
SQALikeColumn(
column_name,
fetch_col_types(self.complex_dataframe_sample[0], column_name),
GenericDataFrameColumnParser.fetch_col_types(
self.complex_dataframe_sample[0], column_name
),
)
)
return sqalike_columns


@ -30,6 +30,7 @@ from metadata.utils.logger import profiler_logger
logger = profiler_logger()
# pylint: disable=too-many-locals
class Histogram(HybridMetric):
"""


@ -126,7 +126,6 @@ class AbstractTableMetricComputer(ABC):
table: Table,
where_clause: Optional[List[ColumnOperators]] = None,
):
query = select(*columns).select_from(table)
if where_clause:
query = query.where(*where_clause)


@ -40,7 +40,6 @@ class ProfilerProcessor(Processor):
"""
def __init__(self, config: OpenMetadataWorkflowConfig):
super().__init__()
self.config = config
@ -56,7 +55,6 @@ class ProfilerProcessor(Processor):
return "Profiler"
def _run(self, record: ProfilerSourceAndEntity) -> Either[ProfilerResponse]:
profiler_runner: Profiler = record.profiler_source.get_profiler_runner(
record.entity, self.profiler_config
)


@ -217,7 +217,6 @@ class ProfilerSource(ProfilerSourceInterface):
def _get_context_entities(
self, entity: Table
) -> Tuple[DatabaseSchema, Database, DatabaseService]:
schema_entity = None
database_entity = None
db_service = None


@ -29,7 +29,6 @@ def bigquery_type_mapper(_type_map: dict, col: Column):
from sqlalchemy_bigquery import STRUCT
def build_struct(_type_map: dict, col: Column):
structs = []
for child in col.children:
if child.dataType != DataType.STRUCT:


@ -35,7 +35,6 @@ class ApiReader(Reader, ABC):
"""
def __init__(self, credentials: ReadersCredentials):
self._auth_headers = None
self.credentials = credentials


@ -15,8 +15,9 @@ from different auths and different file systems.
"""
import ast
import json
import random
import traceback
from typing import Dict, List, Optional, cast
from typing import Dict, List, Optional, Union, cast
from metadata.generated.schema.entity.data.table import Column, DataType
from metadata.ingestion.source.database.column_helpers import truncate_column_name
@ -29,18 +30,6 @@ from metadata.utils.logger import utils_logger
logger = utils_logger()
DATALAKE_DATA_TYPES = {
**dict.fromkeys(["int64", "int", "int32"], DataType.INT),
"dict": DataType.JSON,
"list": DataType.ARRAY,
**dict.fromkeys(["float64", "float32", "float"], DataType.FLOAT),
"bool": DataType.BOOLEAN,
**dict.fromkeys(
["datetime64", "timedelta[ns]", "datetime64[ns]"], DataType.DATETIME
),
"str": DataType.STRING,
}
def fetch_dataframe(
config_source,
@ -100,135 +89,379 @@ def get_file_format_type(key_name, metadata_entry=None):
return False
def unique_json_structure(dicts: List[Dict]) -> Dict:
"""Given a sample of `n` json objects, return a json object that represents the unique structure of all `n` objects.
Note that the type of the key will be that of the last object seen in the sample.
# pylint: disable=import-outside-toplevel
class DataFrameColumnParser:
"""A column parser object. This serves as a Creator class for the appropriate column parser object parser
for datalake types. It allows us to implement different schema parsers for different datalake types without
implementing many conditionals statements.
Args:
dicts: list of json objects
e.g., if we want to implement a column parser for parquet files, we can simply implement a
ParquetDataFrameColumnParser class and add it as part of the `create` method. The `create` method will then return
the appropriate parser based on the file type. The `DataFrameColumnParser` class has a single entry point `get_columns`, which
will call the `get_columns` method of the appropriate parser.
"""
result = {}
for dict_ in dicts:
for key, value in dict_.items():
if isinstance(value, dict):
nested_json = result.get(key, {})
# the `isinstance(nested_json, dict)` check handles the case where we first see a non-dict
# value for a key but later see a dict value: the key is then treated as a dict.
result[key] = unique_json_structure(
[nested_json if isinstance(nested_json, dict) else {}, value]
)
else:
result[key] = value
return result
def __init__(self, parser):
"""Initialize the column parser object"""
self.parser = parser
@classmethod
def create(
cls,
data_frame: "DataFrame",
file_type: Optional[SupportedTypes] = None,
sample: bool = True,
shuffle: bool = False,
):
"""Instantiate a column parser object with the appropriate parser
Args:
data_frame: the dataframe object
file_type: the file type of the dataframe. Will be used to determine the appropriate parser.
sample: whether to sample the dataframe or not if we have a list of dataframes.
If sample is False, we will concatenate the dataframes, which can cause an OOM error for large datasets.
(default: True)
shuffle: whether to shuffle the dataframe list or not if sample is True. (default: False)
"""
data_frame = cls._get_data_frame(data_frame, sample, shuffle)
if file_type == SupportedTypes.PARQUET:
parser = ParquetDataFrameColumnParser(data_frame)
return cls(parser)
parser = GenericDataFrameColumnParser(data_frame)
return cls(parser)
@staticmethod
def _get_data_frame(
data_frame: Union[List["DataFrame"], "DataFrame"], sample: bool, shuffle: bool
):
"""Return the dataframe to use for parsing"""
import pandas as pd
if not isinstance(data_frame, list):
return data_frame
if sample:
if shuffle:
random.shuffle(data_frame)
return data_frame[0]
return pd.concat(data_frame)
def get_columns(self):
"""Get the columns from the parser"""
return self.parser.get_columns()
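A minimal usage sketch of the factory described above, assuming `df` is a pandas DataFrame read from a parquet file (the imports mirror the ones used elsewhere in this diff):

from metadata.readers.dataframe.reader_factory import SupportedTypes
from metadata.utils.datalake.datalake_utils import DataFrameColumnParser

parser = DataFrameColumnParser.create(df, SupportedTypes.PARQUET)  # picks ParquetDataFrameColumnParser
columns = parser.get_columns()  # list of OpenMetadata Column objects

# With a list of dataframe chunks, the default sample=True parses a single chunk;
# sample=False concatenates them first, which can be memory-heavy for large datasets.
chunked = DataFrameColumnParser.create([df, df], SupportedTypes.PARQUET, sample=False)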
def construct_json_column_children(json_column: Dict) -> List[Dict]:
"""Construt a dict representation of a Column object
class GenericDataFrameColumnParser:
"""Given a dataframe object, parse the columns and return a list of Column objects.
Args:
json_column: unique json structure of a column
# TODO: We should consider making the functions above part of the `GenericDataFrameColumnParser` class,
# though we need to do a thorough review of where they are used to avoid unnecessary coupling.
"""
children = []
for key, value in json_column.items():
column = {}
type_ = type(value).__name__.lower()
column["dataTypeDisplay"] = DATALAKE_DATA_TYPES.get(
type_, DataType.UNKNOWN
).value
column["dataType"] = DATALAKE_DATA_TYPES.get(type_, DataType.UNKNOWN).value
column["name"] = truncate_column_name(key)
column["displayName"] = key
if isinstance(value, dict):
column["children"] = construct_json_column_children(value)
children.append(column)
return children
_data_formats = {
**dict.fromkeys(["int64", "int", "int32"], DataType.INT),
"dict": DataType.JSON,
"list": DataType.ARRAY,
**dict.fromkeys(["float64", "float32", "float"], DataType.FLOAT),
"bool": DataType.BOOLEAN,
**dict.fromkeys(
["datetime64", "timedelta[ns]", "datetime64[ns]"], DataType.DATETIME
),
"str": DataType.STRING,
}
def __init__(self, data_frame: "DataFrame"):
self.data_frame = data_frame
def get_children(json_column) -> List[Dict]:
"""Get children of json column.
def get_columns(self):
"""
method to process column details
"""
return self._get_columns(self.data_frame)
Args:
json_column (pandas.Series): column with 100 sample rows.
Sample rows will be used to infer children.
"""
from pandas import Series # pylint: disable=import-outside-toplevel
@classmethod
def _get_columns(cls, data_frame: "DataFrame"):
"""
method to process column details.
json_column = cast(Series, json_column)
try:
json_column = json_column.apply(json.loads)
except TypeError:
# if values are not strings, we will assume they are already json objects
# based on the read class logic
pass
json_structure = unique_json_structure(json_column.values.tolist())
Note this was moved from a function to a class method to bring it closer to the
`GenericDataFrameColumnParser` class. Should be rethought as part of the TODO.
"""
cols = []
if hasattr(data_frame, "columns"):
df_columns = list(data_frame.columns)
for column in df_columns:
# use String by default
data_type = DataType.STRING
try:
if hasattr(data_frame[column], "dtypes"):
data_type = cls.fetch_col_types(data_frame, column_name=column)
return construct_json_column_children(json_structure)
parsed_string = {
"dataTypeDisplay": data_type.value,
"dataType": data_type,
"name": truncate_column_name(column),
"displayName": column,
}
if data_type == DataType.ARRAY:
parsed_string["arrayDataType"] = DataType.UNKNOWN
if data_type == DataType.JSON:
parsed_string["children"] = cls.get_children(
data_frame[column].dropna()[:100]
)
def get_columns(data_frame: "DataFrame"):
"""
method to process column details
"""
cols = []
if hasattr(data_frame, "columns"):
df_columns = list(data_frame.columns)
for column in df_columns:
# use String by default
data_type = DataType.STRING
try:
if hasattr(data_frame[column], "dtypes"):
data_type = fetch_col_types(data_frame, column_name=column)
parsed_string = {
"dataTypeDisplay": data_type.value,
"dataType": data_type,
"name": truncate_column_name(column),
"displayName": column,
}
if data_type == DataType.ARRAY:
parsed_string["arrayDataType"] = DataType.UNKNOWN
if data_type == DataType.JSON:
parsed_string["children"] = get_children(
data_frame[column].dropna()[:100]
cols.append(Column(**parsed_string))
except Exception as exc:
logger.debug(traceback.format_exc())
logger.warning(
f"Unexpected exception parsing column [{column}]: {exc}"
)
return cols
cols.append(Column(**parsed_string))
except Exception as exc:
logger.debug(traceback.format_exc())
logger.warning(f"Unexpected exception parsing column [{column}]: {exc}")
return cols
@classmethod
def fetch_col_types(cls, data_frame, column_name):
"""fetch_col_types: Fetch Column Type for the c
Note this was moved from a function to a class method to bring it closer to the
`GenericDataFrameColumnParser` class. Should be rethought as part of the TODO.
Args:
data_frame (DataFrame)
column_name (string)
"""
data_type = None
try:
if data_frame[column_name].dtypes.name == "object" and any(
data_frame[column_name].dropna().values
):
try:
# Safely evaluate the input string
df_row_val = data_frame[column_name].dropna().values[0]
parsed_object = ast.literal_eval(str(df_row_val))
# Determine the data type of the parsed object
data_type = type(parsed_object).__name__.lower()
except (ValueError, SyntaxError):
# Handle any exceptions that may occur
data_type = "string"
data_type = cls._data_formats.get(
data_type or data_frame[column_name].dtypes.name, DataType.STRING
)
except Exception as err:
logger.warning(
f"Failed to distinguish data type for column {column_name}, Falling back to {data_type}, exc: {err}"
)
logger.debug(traceback.format_exc())
return data_type
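A hedged example of the object-dtype handling above: stringified literals are parsed with `ast.literal_eval` to refine the type, while plain strings fall back to STRING (pandas assumed available):

import pandas as pd

df = pd.DataFrame({"payload": ['{"a": 1}', '{"a": 2}'], "name": ["foo", "bar"]})
GenericDataFrameColumnParser.fetch_col_types(df, "payload")  # -> DataType.JSON (literal parses to a dict)
GenericDataFrameColumnParser.fetch_col_types(df, "name")     # -> DataType.STRING (literal_eval fails, falls back)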
@classmethod
def unique_json_structure(cls, dicts: List[Dict]) -> Dict:
"""Given a sample of `n` json objects, return a json object that represents the unique
structure of all `n` objects. Note that the type of the key will be that of
the last object seen in the sample.
Args:
dicts: list of json objects
"""
result = {}
for dict_ in dicts:
for key, value in dict_.items():
if isinstance(value, dict):
nested_json = result.get(key, {})
# the `isinstance(nested_json, dict)` check handles the case where we first see a non-dict
# value for a key but later see a dict value: the key is then treated as a dict.
result[key] = cls.unique_json_structure(
[nested_json if isinstance(nested_json, dict) else {}, value]
)
else:
result[key] = value
return result
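A short worked example of the structure-merging behaviour described in the docstring (note how the last value seen for `a` wins):

sample = [{"a": 1, "b": {"c": 2}}, {"a": "x", "b": {"d": 3}}, {"e": None}]
GenericDataFrameColumnParser.unique_json_structure(sample)
# -> {"a": "x", "b": {"c": 2, "d": 3}, "e": None}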
@classmethod
def construct_json_column_children(cls, json_column: Dict) -> List[Dict]:
"""Construt a dict representation of a Column object
Args:
json_column: unique json structure of a column
"""
children = []
for key, value in json_column.items():
column = {}
type_ = type(value).__name__.lower()
column["dataTypeDisplay"] = cls._data_formats.get(
type_, DataType.UNKNOWN
).value
column["dataType"] = cls._data_formats.get(type_, DataType.UNKNOWN).value
column["name"] = truncate_column_name(key)
column["displayName"] = key
if isinstance(value, dict):
column["children"] = cls.construct_json_column_children(value)
children.append(column)
return children
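Continuing the example above, the merged structure is turned into column dicts; the `dataType` values shown are the `DataType` members the mapping resolves to:

GenericDataFrameColumnParser.construct_json_column_children({"a": "x", "b": {"c": 2}})
# -> [
#   {"dataTypeDisplay": "STRING", "dataType": DataType.STRING, "name": "a", "displayName": "a"},
#   {"dataTypeDisplay": "JSON", "dataType": DataType.JSON, "name": "b", "displayName": "b",
#    "children": [{"dataTypeDisplay": "INT", "dataType": DataType.INT, "name": "c", "displayName": "c"}]},
# ]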
@classmethod
def get_children(cls, json_column) -> List[Dict]:
"""Get children of json column.
Args:
json_column (pandas.Series): column with 100 sample rows.
Sample rows will be used to infer children.
"""
from pandas import Series # pylint: disable=import-outside-toplevel
json_column = cast(Series, json_column)
try:
json_column = json_column.apply(json.loads)
except TypeError:
# if values are not strings, we will assume they are already json objects
# based on the read class logic
pass
json_structure = cls.unique_json_structure(json_column.values.tolist())
return cls.construct_json_column_children(json_structure)
def fetch_col_types(data_frame, column_name):
"""fetch_col_types: Fetch Column Type for the c
# pylint: disable=import-outside-toplevel
class ParquetDataFrameColumnParser:
"""Given a dataframe object generated from a parquet file, parse the columns and return a list of Column objects."""
Args:
data_frame (DataFrame)
column_name (string)
"""
data_type = None
try:
if data_frame[column_name].dtypes.name == "object" and any(
data_frame[column_name].dropna().values
import pyarrow as pa
_data_formats = {
**dict.fromkeys(
["int8", "int16", "int32", "int64", "int", pa.DurationType], DataType.INT
),
**dict.fromkeys(["uint8", "uint16", "uint32", "uint64", "uint"], DataType.UINT),
pa.StructType: DataType.STRUCT,
**dict.fromkeys([pa.ListType, pa.LargeListType], DataType.ARRAY),
**dict.fromkeys(
["halffloat", "float32", "float64", "double", "float"], DataType.FLOAT
),
"bool": DataType.BOOLEAN,
**dict.fromkeys(
[
"datetime64",
"timedelta[ns]",
"datetime64[ns]",
"time32[s]",
"time32[ms]",
"time64[ns]",
"time64[us]",
pa.TimestampType,
"date64",
],
DataType.DATETIME,
),
"date32[day]": DataType.DATE,
"string": DataType.STRING,
**dict.fromkeys(
["binary", "large_binary", pa.FixedSizeBinaryType], DataType.BINARY
),
**dict.fromkeys([pa.Decimal128Type, pa.Decimal256Type], DataType.DECIMAL),
}
def __init__(self, data_frame: "DataFrame"):
import pyarrow as pa
self.data_frame = data_frame
self._arrow_table = pa.Table.from_pandas(self.data_frame)
def get_columns(self):
"""
method to process column details for parquet files
"""
import pyarrow as pa
schema: List[pa.Field] = self._arrow_table.schema
columns = []
for column in schema:
parsed_column = {
"dataTypeDisplay": str(column.type),
"dataType": self._get_pq_data_type(column),
"name": truncate_column_name(column.name),
"displayName": column.name,
}
if parsed_column["dataType"] == DataType.ARRAY:
try:
item_field = column.type.value_field
parsed_column["arrayDataType"] = self._get_pq_data_type(item_field)
except AttributeError:
# if the value field is not specified, we will set it to UNKNOWN
parsed_column["arrayDataType"] = DataType.UNKNOWN
if parsed_column["dataType"] == DataType.BINARY:
try:
data_length = type(column.type).byte_width
except AttributeError:
# if the byte width is not specified, we will set it to -1
# following pyarrow convention
data_length = -1
parsed_column["dataLength"] = data_length
if parsed_column["dataType"] == DataType.STRUCT:
parsed_column["children"] = self._get_children(column)
columns.append(Column(**parsed_column))
return columns
def _get_children(self, column):
"""For struct types, get the children of the column
Args:
column (pa.Field): pa column
"""
field_idx = column.type.num_fields
children = []
for idx in range(field_idx):
child = column.type.field(idx)
data_type = self._get_pq_data_type(child)
child_column = {
"dataTypeDisplay": str(child.type),
"dataType": data_type,
"name": truncate_column_name(child.name),
"displayName": child.name,
}
if data_type == DataType.STRUCT:
child_column["children"] = self._get_children(child)
children.append(child_column)
return children
def _get_pq_data_type(self, column):
"""Given a column return the type of the column
Args:
column (pa.Field): pa column
"""
import pyarrow as pa
if isinstance(
column.type,
(
pa.DurationType,
pa.StructType,
pa.ListType,
pa.LargeListType,
pa.TimestampType,
pa.Decimal128Type,
pa.Decimal256Type,
pa.FixedSizeBinaryType,
),
):
try:
# Safely evaluate the input string
df_row_val = data_frame[column_name].dropna().values[0]
parsed_object = ast.literal_eval(str(df_row_val))
# Determine the data type of the parsed object
data_type = type(parsed_object).__name__.lower()
except (ValueError, SyntaxError):
# Handle any exceptions that may occur
data_type = "string"
# the above types can take many shapes
# (e.g. pa.ListType(pa.StructType([pa.field("a", pa.int64())])), etc.)
# so we'll use their type class to determine the data type
data_type = self._data_formats.get(type(column.type), DataType.UNKNOWN)
else:
# for the other types we need to use their string representation
# to determine the data type as `type(column.type)` will return
# a generic `pyarrow.lib.DataType`
data_type = self._data_formats.get(str(column.type), DataType.UNKNOWN)
data_type = DATALAKE_DATA_TYPES.get(
data_type or data_frame[column_name].dtypes.name, DataType.STRING
)
except Exception as err:
logger.warning(
f"Failed to distinguish data type for column {column_name}, Falling back to {data_type}, exc: {err}"
)
logger.debug(traceback.format_exc())
return data_type
return data_type
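A hedged illustration of the two lookup paths in `_get_pq_data_type`: parametrized Arrow types are matched by their type class, simple ones by their string form (pyarrow assumed available; the field names are hypothetical):

import pyarrow as pa

ts_field = pa.field("created_at", pa.timestamp("ns"))
type(ts_field.type)  # -> pyarrow.lib.TimestampType, looked up via type(column.type)

name_field = pa.field("name", pa.string())
str(name_field.type)  # -> "string", looked up via str(column.type)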


@ -63,7 +63,6 @@ def _(provider: SecretsManagerProvider) -> Optional[AWSCredentials]:
@secrets_manager_client_loader.add(SecretsManagerClientLoader.env.value)
def _(provider: SecretsManagerProvider) -> Optional[AWSCredentials]:
if provider in {
SecretsManagerProvider.aws,
SecretsManagerProvider.managed_aws,


@ -104,7 +104,6 @@ class SecretsManagerFactory(metaclass=Singleton):
return self.secrets_manager
def _load_secrets_manager_credentials(self) -> Optional["AWSCredentials"]:
if not self.secrets_manager_loader:
return None


@ -80,7 +80,6 @@ class ApplicationWorkflow(BaseWorkflow, ABC):
runner: Optional[AppRunner]
def __init__(self, config_dict: dict):
self.runner = None # Will be passed in post-init
# TODO: Create a parse_gracefully method
self.config = OpenMetadataApplicationConfig.parse_obj(config_dict)


@ -235,7 +235,6 @@ class BaseWorkflow(ABC, WorkflowStatusMixin):
service = self._get_ingestion_pipeline_service()
if service is not None:
return self.metadata.create_or_update(
CreateIngestionPipelineRequest(
name=pipeline_name,


@ -31,7 +31,6 @@ class MetadataWorkflow(IngestionWorkflow):
"""
def set_steps(self):
# We keep the source registered in the workflow
self.source = self._get_source()
sink = self._get_sink()


@ -33,7 +33,6 @@ class UsageWorkflow(IngestionWorkflow):
"""
def set_steps(self):
# We keep the source registered in the workflow
self.source = self._get_source()
processor = self._get_processor()


@ -20,7 +20,6 @@ from .common_e2e_sqa_mixins import SQACommonMethods
class HiveCliTest(CliCommonDB.TestSuite, SQACommonMethods):
prepare_e2e: List[str] = [
"DROP DATABASE IF EXISTS e2e_cli_tests CASCADE",
"CREATE DATABASE e2e_cli_tests",


@ -20,7 +20,6 @@ from .common.test_cli_dashboard import CliCommonDashboard
class MetabaseCliTest(CliCommonDashboard.TestSuite):
# in case we want to do something before running the tests
def prepare(self) -> None:
redshift_file_path = str(


@ -23,7 +23,6 @@ from .common_e2e_sqa_mixins import SQACommonMethods
class OracleCliTest(CliCommonDB.TestSuite, SQACommonMethods):
create_table_query: str = """
CREATE TABLE admin.admin_emp (
empno NUMBER(5) PRIMARY KEY,


@ -20,7 +20,6 @@ from .common.test_cli_dashboard import CliCommonDashboard
class PowerBICliTest(CliCommonDashboard.TestSuite):
# in case we want to do something before running the tests
def prepare(self) -> None:
redshift_file_path = str(


@ -20,7 +20,6 @@ from .common.test_cli_dashboard import CliCommonDashboard
class TableauCliTest(CliCommonDashboard.TestSuite):
# in case we want to do something before running the tests
def prepare(self) -> None:
redshift_file_path = str(


@ -157,7 +157,6 @@ class TestAirflowLineageRuner(TestCase):
)
def test_lineage_runner(self):
with DAG("test_runner", start_date=datetime(2021, 1, 1)) as dag:
BashOperator(
task_id="print_date",


@ -300,7 +300,6 @@ def get_create_test_case(
def get_test_dag(name: str) -> DAG:
"""Get a DAG with the tasks created in the CreatePipelineRequest"""
with DAG(name, start_date=datetime(2021, 1, 1)) as dag:
tasks = [
BashOperator(
task_id=task_id,


@ -52,7 +52,6 @@ class TestSecretsManagerFactory(TestCase):
)
def test_invalid_config_secret_manager(self):
om_connection: OpenMetadataConnection = self.build_open_metadata_connection(
SecretsManagerProvider.db,
SecretsManagerClientLoader.noop,


@ -40,7 +40,6 @@ class ColumnNameScannerTest(TestCase):
self.assertIsNone(ColumnNameScanner.scan("user_id"))
def test_column_names_sensitive(self):
# Bank
self.assertEqual(ColumnNameScanner.scan("bank_account"), EXPECTED_SENSITIVE)


@ -62,7 +62,6 @@ class FakeConnection:
class PandasInterfaceTest(TestCase):
import pandas as pd
col_names = [


@ -328,7 +328,6 @@ class ProfilerInterfaceTest(TestCase):
self.assertEqual(50, actual)
def test_table_config_casting(self):
expected = TableConfig(
profileSample=200,
profileSampleType=ProfileSampleType.PERCENTAGE,


@ -18,7 +18,7 @@ from unittest import TestCase
from metadata.generated.schema.entity.data.table import DataType
from metadata.ingestion.source.database.column_type_parser import ColumnTypeParser
from metadata.utils.datalake.datalake_utils import fetch_col_types
from metadata.utils.datalake.datalake_utils import GenericDataFrameColumnParser
COLUMN_TYPE_PARSE = [
"array<string>",
@ -129,4 +129,6 @@ def test_check_datalake_type():
}
df = pd.read_csv("ingestion/tests/unit/test_column_type_parser.csv")
for column_name in df.columns.values.tolist():
assert assert_col_type_dict.get(column_name) == fetch_col_types(df, column_name)
assert assert_col_type_dict.get(
column_name
) == GenericDataFrameColumnParser.fetch_col_types(df, column_name)


@ -622,7 +622,6 @@ class PGSpiderLineageUnitTests(TestCase):
connection=self.postgres.service_connection,
service_name=self.postgres.config.serviceName,
):
if isinstance(record, AddLineageRequest):
requests.append(record)
@ -661,7 +660,6 @@ class PGSpiderLineageUnitTests(TestCase):
connection=self.postgres.service_connection,
service_name=self.postgres.config.serviceName,
):
if isinstance(record, AddLineageRequest):
requests.append(record)
@ -700,7 +698,6 @@ class PGSpiderLineageUnitTests(TestCase):
connection=self.postgres.service_connection,
service_name=self.postgres.config.serviceName,
):
if isinstance(record, AddLineageRequest):
requests.append(record)
@ -738,7 +735,6 @@ class PGSpiderLineageUnitTests(TestCase):
connection=self.postgres.service_connection,
service_name=self.postgres.config.serviceName,
):
if isinstance(record, AddLineageRequest):
requests.append(record)
@ -773,7 +769,6 @@ class PGSpiderLineageUnitTests(TestCase):
connection=self.postgres.service_connection,
service_name=self.postgres.config.serviceName,
):
if isinstance(record, AddLineageRequest):
requests.append(record)
@ -809,7 +804,6 @@ class PGSpiderLineageUnitTests(TestCase):
connection=self.postgres.service_connection,
service_name=self.postgres.config.serviceName,
):
if isinstance(record, AddLineageRequest):
requests.append(record)


@ -33,7 +33,7 @@ from metadata.generated.schema.type.entityReference import EntityReference
from metadata.ingestion.source.database.datalake.metadata import DatalakeSource
from metadata.readers.dataframe.avro import AvroDataFrameReader
from metadata.readers.dataframe.json import JSONDataFrameReader
from metadata.utils.datalake.datalake_utils import get_columns
from metadata.utils.datalake.datalake_utils import GenericDataFrameColumnParser
mock_datalake_config = {
"source": {
@ -459,13 +459,17 @@ class DatalakeUnitTest(TestCase):
actual_df_3 = JSONDataFrameReader.read_from_json(
key="file.json", json_text=EXAMPLE_JSON_TEST_3, decode=True
)[0]
actual_cols_3 = get_columns(actual_df_3)
actual_cols_3 = GenericDataFrameColumnParser._get_columns(
actual_df_3
) # pylint: disable=protected-access
assert actual_cols_3 == EXAMPLE_JSON_COL_3
actual_df_4 = JSONDataFrameReader.read_from_json(
key="file.json", json_text=EXAMPLE_JSON_TEST_4, decode=True
)[0]
actual_cols_4 = get_columns(actual_df_4)
actual_cols_4 = GenericDataFrameColumnParser._get_columns(
actual_df_4
) # pylint: disable=protected-access
assert actual_cols_4 == EXAMPLE_JSON_COL_4
def test_avro_file_parse(self):


@ -641,7 +641,6 @@ class IcebergUnitTest(TestCase):
with patch.object(
HiveCatalog, "list_tables", return_value=MOCK_TABLE_LIST
), patch.object(HiveCatalog, "load_table", return_value=LoadTableMock()):
for i, table in enumerate(self.iceberg.get_tables_name_and_type()):
self.assertEqual(table, EXPECTED_TABLE_LIST[i])
@ -655,7 +654,6 @@ class IcebergUnitTest(TestCase):
), patch.object(
HiveCatalog, "load_table", side_effect=raise_no_such_iceberg_table
):
self.assertEqual(len(list(self.iceberg.get_tables_name_and_type())), 0)
# When pyiceberg.exceptions.NoSuchTableError is raised
@ -666,7 +664,6 @@ class IcebergUnitTest(TestCase):
with patch.object(
HiveCatalog, "list_tables", return_value=MOCK_TABLE_LIST
), patch.object(HiveCatalog, "load_table", side_effect=raise_no_such_table):
self.assertEqual(len(list(self.iceberg.get_tables_name_and_type())), 0)
def test_get_owner_ref(self):
@ -802,7 +799,6 @@ class IcebergUnitTest(TestCase):
with patch.object(
OpenMetadata, "get_reference_by_email", return_value=ref
), patch.object(fqn, "build", return_value=fq_database_schema):
result = next(self.iceberg.yield_table((table_name, table_type))).right
self.assertEqual(result, expected)


@ -192,7 +192,6 @@ class AirbyteUnitTest(TestCase):
assert pipline == EXPECTED_CREATED_PIPELINES
def test_pipeline_status(self):
status = [
either.right
for either in self.airbyte.yield_pipeline_status(EXPECTED_ARIBYTE_DETAILS)


@ -308,7 +308,6 @@ class StorageUnitTest(TestCase):
)
],
):
Column.__eq__ = custom_column_compare
self.assertListEqual(
[


@ -12,12 +12,17 @@
Test datalake utils
"""
import os
from unittest import TestCase
from metadata.generated.schema.entity.data.table import Column
import pandas as pd
from metadata.generated.schema.entity.data.table import Column, DataType
from metadata.readers.dataframe.reader_factory import SupportedTypes
from metadata.utils.datalake.datalake_utils import (
construct_json_column_children,
unique_json_structure,
DataFrameColumnParser,
GenericDataFrameColumnParser,
ParquetDataFrameColumnParser,
)
STRUCTURE = {
@ -53,7 +58,7 @@ class TestDatalakeUtils(TestCase):
]
expected = STRUCTURE
actual = unique_json_structure(sample_data)
actual = GenericDataFrameColumnParser.unique_json_structure(sample_data)
self.assertDictEqual(expected, actual)
@ -153,14 +158,16 @@ class TestDatalakeUtils(TestCase):
],
},
]
actual = construct_json_column_children(STRUCTURE)
actual = GenericDataFrameColumnParser.construct_json_column_children(STRUCTURE)
for el in zip(expected, actual):
self.assertDictEqual(el[0], el[1])
def test_create_column_object(self):
"""test create column object fn"""
formatted_column = construct_json_column_children(STRUCTURE)
formatted_column = GenericDataFrameColumnParser.construct_json_column_children(
STRUCTURE
)
column = {
"dataTypeDisplay": "STRING",
"dataType": "STRING",
@ -170,3 +177,270 @@ class TestDatalakeUtils(TestCase):
}
column_obj = Column(**column)
assert len(column_obj.children) == 3
class TestParquetDataFrameColumnParser(TestCase):
"""Test parquet dataframe column parser"""
@classmethod
def setUpClass(cls) -> None:
resources_path = os.path.join(
os.path.dirname(os.path.dirname(__file__)), "resources"
)
cls.parquet_path = os.path.join(resources_path, "datalake", "example.parquet")
cls.df = pd.read_parquet(cls.parquet_path)
cls.parquet_parser = ParquetDataFrameColumnParser(cls.df)
def test_parser_instantiation(self):
"""Test the right parser is instantiated from the creator method"""
parquet_parser = DataFrameColumnParser.create(self.df, SupportedTypes.PARQUET)
self.assertIsInstance(parquet_parser.parser, ParquetDataFrameColumnParser)
other_types = [typ for typ in SupportedTypes if typ != SupportedTypes.PARQUET]
for other_type in other_types:
with self.subTest(other_type=other_type):
generic_parser = DataFrameColumnParser.create(self.df, other_type)
self.assertIsInstance(
generic_parser.parser, GenericDataFrameColumnParser
)
def test_shuffle_and_sample_from_parser(self):
"""test the shuffle and sampling logic from the parser creator method"""
parquet_parser = DataFrameColumnParser.create(self.df, SupportedTypes.PARQUET)
self.assertEqual(parquet_parser.parser.data_frame.shape, self.df.shape)
parquet_parser = DataFrameColumnParser.create(
[self.df, self.df], SupportedTypes.PARQUET
)
self.assertEqual(parquet_parser.parser.data_frame.shape, self.df.shape)
parquet_parser = DataFrameColumnParser.create(
[self.df, self.df], SupportedTypes.PARQUET, sample=False
)
self.assertEqual(
parquet_parser.parser.data_frame.shape, pd.concat([self.df, self.df]).shape
)
def test_get_columns(self):
"""test `get_columns` method of the parquet column parser"""
expected = [
Column(
dataTypeDisplay="bool",
dataType=DataType.BOOLEAN,
name="a",
displayName="a",
), # type: ignore
Column(
dataTypeDisplay="int8",
dataType=DataType.INT,
name="b",
displayName="b",
), # type: ignore
Column(
dataTypeDisplay="int16",
dataType=DataType.INT,
name="c",
displayName="c",
), # type: ignore
Column(
dataTypeDisplay="int32",
dataType=DataType.INT,
name="d",
displayName="d",
), # type: ignore
Column(
dataTypeDisplay="int64",
dataType=DataType.INT,
name="e",
displayName="e",
), # type: ignore
Column(
dataTypeDisplay="uint8",
dataType=DataType.UINT,
name="f",
displayName="f",
), # type: ignore
Column(
dataTypeDisplay="uint16",
dataType=DataType.UINT,
name="g",
displayName="g",
), # type: ignore
Column(
dataTypeDisplay="uint32",
dataType=DataType.UINT,
name="h",
displayName="h",
), # type: ignore
Column(
dataTypeDisplay="uint64",
dataType=DataType.UINT,
name="i",
displayName="i",
), # type: ignore
Column(
dataTypeDisplay="float",
dataType=DataType.FLOAT,
name="k",
displayName="k",
), # type: ignore
Column(
dataTypeDisplay="double",
dataType=DataType.FLOAT,
name="l",
displayName="l",
), # type: ignore
Column(
dataTypeDisplay="time64[us]",
dataType=DataType.DATETIME,
name="n",
displayName="n",
), # type: ignore
Column(
dataTypeDisplay="timestamp[ns]",
dataType=DataType.DATETIME,
name="o",
displayName="o",
), # type: ignore
Column(
dataTypeDisplay="date32[day]",
dataType=DataType.DATE,
name="p",
displayName="p",
), # type: ignore
Column(
dataTypeDisplay="date32[day]",
dataType=DataType.DATE,
name="q",
displayName="q",
), # type: ignore
Column(
dataTypeDisplay="duration[ns]",
dataType=DataType.INT,
name="r",
displayName="r",
), # type: ignore
Column(
dataTypeDisplay="binary",
dataType=DataType.BINARY,
name="t",
displayName="t",
), # type: ignore
Column(
dataTypeDisplay="string",
dataType=DataType.STRING,
name="u",
displayName="u",
), # type: ignore
Column(
dataTypeDisplay="string",
dataType=DataType.STRING,
name="v",
displayName="v",
), # type: ignore
Column(
dataTypeDisplay="binary",
dataType=DataType.BINARY,
name="w",
displayName="w",
), # type: ignore
Column(
dataTypeDisplay="string",
dataType=DataType.STRING,
name="x",
displayName="x",
), # type: ignore
Column(
dataTypeDisplay="string",
dataType=DataType.STRING,
name="y",
displayName="y",
), # type: ignore
Column(
dataTypeDisplay="list<item: int64>",
dataType=DataType.ARRAY,
name="aa",
displayName="aa",
), # type: ignore
Column(
dataTypeDisplay="list<item: int64>",
dataType=DataType.ARRAY,
name="bb",
displayName="bb",
), # type: ignore
Column(
dataTypeDisplay="struct<ee: int64, ff: int64, gg: struct<hh: struct<ii: int64, jj: int64, kk: int64>>>",
dataType=DataType.STRUCT,
name="dd",
displayName="dd",
children=[
Column(
dataTypeDisplay="int64",
dataType=DataType.INT,
name="ee",
displayName="ee",
), # type: ignore
Column(
dataTypeDisplay="int64",
dataType=DataType.INT,
name="ff",
displayName="ff",
), # type: ignore
Column(
dataTypeDisplay="struct<hh: struct<ii: int64, jj: int64, kk: int64>>",
dataType=DataType.STRUCT,
name="gg",
displayName="gg",
children=[
Column(
dataTypeDisplay="struct<ii: int64, jj: int64, kk: int64>",
dataType=DataType.STRUCT,
name="hh",
displayName="hh",
children=[
Column(
dataTypeDisplay="int64",
dataType=DataType.INT,
name="ii",
displayName="ii",
), # type: ignore
Column(
dataTypeDisplay="int64",
dataType=DataType.INT,
name="jj",
displayName="jj",
), # type: ignore
Column(
dataTypeDisplay="int64",
dataType=DataType.INT,
name="kk",
displayName="kk",
), # type: ignore
],
),
],
),
],
), # type: ignore
]
actual = self.parquet_parser.get_columns()
for validation in zip(expected, actual):
with self.subTest(validation=validation):
expected_col, actual_col = validation
self.assertEqual(expected_col.name, actual_col.name)
self.assertEqual(expected_col.displayName, actual_col.displayName)
self.assertEqual(expected_col.dataType, actual_col.dataType)
def _validate_parsed_column(self, expected, actual):
"""validate parsed column"""
self.assertEqual(expected.name, actual.name)
self.assertEqual(expected.dataType, actual.dataType)
self.assertEqual(expected.displayName, actual.displayName)
if expected.children:
self.assertEqual(len(expected.children), len(actual.children))
for validation in zip(expected.children, actual.children):
with self.subTest(validation=validation):
expected_col, actual_col = validation
self._validate_parsed_column(expected_col, actual_col)


@ -110,7 +110,6 @@ def get_dagbag():
class ScanDagsTask(Process):
def run(self):
if airflow_version >= "2.6":
scheduler_job = self._run_new_scheduler_job()
else:


@ -149,7 +149,8 @@
"LARGEINT",
"QUANTILE_STATE",
"AGG_STATE",
"BITMAP"
"BITMAP",
"UINT"
]
},
"constraint": {