mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-09-03 14:13:06 +00:00
115 lines
4.5 KiB
Python
115 lines
4.5 KiB
Python
# Copyright 2021 Collate
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
"""
|
|
Interfaces with the database for all database engines
|
|
supporting the SQLAlchemy abstraction layer
|
|
"""
|
|
from typing import List
|
|
|
|
from pyhive.sqlalchemy_hive import HiveCompiler
|
|
from sqlalchemy import Column, inspect
|
|
|
|
from metadata.generated.schema.entity.data.table import Column as OMColumn
|
|
from metadata.generated.schema.entity.data.table import ColumnName, DataType, TableData
|
|
from metadata.generated.schema.entity.services.databaseService import (
|
|
DatabaseServiceType,
|
|
)
|
|
from metadata.profiler.interface.sqlalchemy.profiler_interface import (
|
|
SQAProfilerInterface,
|
|
)
|
|
from metadata.profiler.orm.converter.base import build_orm_col
|
|
|
|
|
|
class DatabricksProfilerInterface(SQAProfilerInterface):
    """Databricks profiler interface.

    Specialises the SQLAlchemy profiler interface for Databricks by
    flattening struct columns into their leaf children and making sure
    every level of a nested column path is backtick-quoted on its own
    (`` `a`.`b`.`c` `` rather than `` `a.b.c` ``).
    """

    def visit_column(self, *args, **kwargs):
        """Compile a column reference, quoting nested struct children.

        ``__init__`` installs this function as ``HiveCompiler.visit_column``,
        which is why ``super`` is invoked with ``HiveCompiler`` explicitly:
        it is meant to run with a ``HiveCompiler`` instance as ``self``.

        Returns:
            The compiled column string, with a dotted trailing identifier
            split into individually backtick-delimited parts.
        """
        result = super(  # pylint: disable=bad-super-call
            HiveCompiler, self
        ).visit_column(*args, **kwargs)
        # Databricks uses HiveCompiler.
        # `result` here would be `db.schema.table` or `db.schema.table.column`;
        # for a struct it will be `db.schema.table.column.nestedchild.nestedchild`.
        # The logic below adds the backticks around each nested child, e.g.
        #   `db`.`schema`.`table`.`col.child` -> `db`.`schema`.`table`.`col`.`child`
        dot_count = result.count(".")
        if dot_count > 1 and "." in result.split("`.`")[-1]:
            segments = result.split("`.")
            # The last segment is the identifier that still contains raw dots.
            nested_parts = segments[-1].split(".")
            result = "`.".join(segments[:-1])
            if result:
                # Restore the separator consumed by the split above.
                result += "`."
            result += "`.`".join(nested_parts)
        return result

    def __init__(self, service_connection_config, **kwargs):
        """Initialise the interface and install the Databricks column compiler.

        Args:
            service_connection_config: forwarded to the parent initialiser.
            **kwargs: forwarded unchanged to the parent initialiser.
        """
        super().__init__(service_connection_config=service_connection_config, **kwargs)
        self.set_catalog(self.session)
        # Class-level patch: every HiveCompiler in this process will use the
        # struct-aware `visit_column` defined above from now on.
        HiveCompiler.visit_column = DatabricksProfilerInterface.visit_column

    def _get_struct_columns(
        self, columns: List[OMColumn], parent: str
    ) -> List[Column]:
        """Recursively collect the leaf columns of a struct as ORM columns.

        Args:
            columns: children of the struct column being expanded.
            parent: dot-separated path of the enclosing struct column(s).

        Returns:
            A flat list of ORM columns, one per non-struct descendant, each
            attached to ``self.table.__table__``.
        """
        # For DBX struct we need to quote the column name as `a`.`b`.`c`
        # otherwise the driver will quote it as `a.b.c`.
        # `parent` does not change inside the loop, so build the prefix once.
        quoted_parent = ".".join([f"`{part}`" for part in parent.split(".")])

        columns_list = []
        for col in columns:
            if col.dataType != DataType.STRUCT:
                # NOTE(review): this renames the entity column in place, so
                # expanding the same entity objects a second time would nest
                # the quoted prefix again - confirm callers expand only once.
                col.name = ColumnName(f"{quoted_parent}.`{col.name.root}`")
                # Set `_quote` to False to avoid quoting the column name again
                # when compiled.
                sqa_col = build_orm_col(
                    idx=1,
                    col=col,
                    table_service_type=DatabaseServiceType.Databricks,
                    _quote=False,
                )
                sqa_col._set_parent(  # pylint: disable=protected-access
                    self.table.__table__
                )
                columns_list.append(sqa_col)
            else:
                # Nested struct: descend with the extended parent path.
                cols = self._get_struct_columns(
                    col.children, f"{parent}.{col.name.root}"
                )
                columns_list.extend(cols)
        return columns_list

    def get_columns(self) -> List[Column]:
        """Get columns from table.

        Struct columns are replaced by their flattened leaf children; every
        other column is converted directly.

        Returns:
            The list of ORM columns attached to ``self.table.__table__``.
        """
        columns = []
        for idx, column_obj in enumerate(self.table_entity.columns):
            if column_obj.dataType == DataType.STRUCT:
                columns.extend(
                    self._get_struct_columns(column_obj.children, column_obj.name.root)
                )
            else:
                col = build_orm_col(idx, column_obj, DatabaseServiceType.Databricks)
                col._set_parent(  # pylint: disable=protected-access
                    self.table.__table__
                )
                columns.append(col)
        return columns

    def fetch_sample_data(self, table, columns) -> TableData:
        """Fetch sample data from database

        Args:
            table: ORM declarative table
            columns: not used by this implementation; the sampler is always
                given every column of ``self.table``.

        Returns:
            TableData: sample table data
        """
        sampler = self._get_sampler(
            table=table,
        )

        return sampler.fetch_sample_data(list(inspect(self.table).c))
|