* Fix #1843: Add Delta Lake Connector
This commit is contained in:
parent
917ccd7147
commit
0e736012a9
186
ingestion/src/metadata/ingestion/source/deltalake.py
Normal file
@@ -0,0 +1,186 @@
import logging
import uuid
from typing import Any, Dict, Iterable, List, Optional

from pyspark.sql import SparkSession
from pyspark.sql.catalog import Table
from pyspark.sql.types import ArrayType, MapType, StructField, StructType
from pyspark.sql.utils import AnalysisException, ParseException

from metadata.config.common import ConfigModel
from metadata.generated.schema.entity.data.database import Database
from metadata.generated.schema.entity.data.table import Column, Table
from metadata.generated.schema.type.entityReference import EntityReference
from metadata.ingestion.api.common import IncludeFilterPattern, WorkflowContext
from metadata.ingestion.api.source import Source
from metadata.ingestion.models.ometa_table_db import OMetaDatabaseAndTable
from metadata.ingestion.ometa.openmetadata_rest import MetadataServerConfig
from metadata.ingestion.source.sql_source import SQLSourceStatus
from metadata.utils.helpers import get_database_service_or_create

logger: logging.Logger = logging.getLogger(__name__)
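
# The two classes below follow the usual OpenMetadata ingestion pattern: a
# ConfigModel describing the connector options, and a Source implementation that
# emits OMetaDatabaseAndTable records for the rest of the workflow to consume.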


class DeltaLakeSourceConfig(ConfigModel):
    database: str = "delta"
    platform_name: str = "deltalake"
    schema_filter_pattern: IncludeFilterPattern = IncludeFilterPattern.allow_all()
    table_filter_pattern: IncludeFilterPattern = IncludeFilterPattern.allow_all()
    service_name: str
    service_type: str

    def get_service_name(self) -> str:
        return self.service_name
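
# Illustrative config (assumed values, not taken from the repository) that
# DeltaLakeSourceConfig could parse via parse_obj in DeltaLakeSource.create below:
#   {
#       "service_name": "local_deltalake",
#       "service_type": "DeltaLake",
#       "database": "delta",
#       "table_filter_pattern": {"excludes": ["temp_.*"]}
#   }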


class DeltaLakeSource(Source):
    spark: SparkSession = None

    def __init__(
        self,
        config: DeltaLakeSourceConfig,
        metadata_config: MetadataServerConfig,
        ctx: WorkflowContext,
    ):
        super().__init__(ctx)
        self.config = config
        self.metadata_config = metadata_config
        self.service = get_database_service_or_create(config, metadata_config)
        self.status = SQLSourceStatus()
        # The Spark session needs to be initiated outside the workflow and passed
        # in through the WorkflowContext.
        self.spark = ctx.spark
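
    # create() is the factory hook the ingestion framework is expected to call: it
    # parses the raw source and metadata-server config dicts into typed models and
    # passes along the WorkflowContext, which (per the note above) must already
    # carry the SparkSession.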
    @classmethod
    def create(
        cls, config_dict: dict, metadata_config_dict: dict, ctx: WorkflowContext
    ):
        config = DeltaLakeSourceConfig.parse_obj(config_dict)
        metadata_config = MetadataServerConfig.parse_obj(metadata_config_dict)
        return cls(config, metadata_config, ctx)
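
    # next_record() drives the ingestion loop: each database visible to the Spark
    # catalog is checked against the schema filter pattern, and matching schemas
    # are expanded into table records by fetch_tables().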
    def next_record(self) -> Iterable[OMetaDatabaseAndTable]:
        schemas = self.spark.catalog.listDatabases()
        for schema in schemas:
            schema_name = schema.name
            if not self.config.schema_filter_pattern.included(schema_name):
                self.status.filter(schema_name, "Schema pattern not allowed")
                continue
            yield from self.fetch_tables(schema_name)
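
    # For every table in a schema, build a Table entity (views additionally carry a
    # viewDefinition derived from DESCRIBE EXTENDED) and pair it with its Database;
    # per-table failures are logged and recorded as warnings rather than aborting
    # the whole run.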
    def fetch_tables(self, schema: str) -> Iterable[OMetaDatabaseAndTable]:
        for table in self.spark.catalog.listTables(schema):
            try:
                database = table.database
                table_name = table.name
                if not self.config.table_filter_pattern.included(table_name):
                    self.status.filter(
                        "{}.{}".format(self.config.get_service_name(), table_name),
                        "Table pattern not allowed",
                    )
                    continue
                self.status.scanned(
                    "{}.{}".format(self.config.get_service_name(), table_name)
                )
                table_columns = self._fetch_columns(schema, table_name)
                fqn = f"{self.config.service_name}.{self.config.database}.{schema}.{table_name}"
                if table.tableType and table.tableType.lower() != "view":
                    table_description = self._fetch_table_description(table_name)
                    table_entity = Table(
                        id=uuid.uuid4(),
                        name=table_name,
                        tableType=table.tableType,
                        description=" ",
                        fullyQualifiedName=fqn,
                        columns=table_columns,
                    )
                else:
                    view_definition = self._fetch_view_schema(table_name)
                    table_entity = Table(
                        id=uuid.uuid4(),
                        name=table_name,
                        tableType=table.tableType,
                        description=" ",
                        fullyQualifiedName=fqn,
                        columns=table_columns,
                        viewDefinition=view_definition,
                    )

                table_and_db = OMetaDatabaseAndTable(
                    table=table_entity, database=self._get_database(schema)
                )
                yield table_and_db
            except Exception as err:
                logger.error(err)
                self.status.warnings.append(
                    "{}.{}".format(self.config.service_name, table.name)
                )
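
    # Wrap a schema name into a Database entity that references the registered
    # database service.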
    def _get_database(self, schema: str) -> Database:
        return Database(
            name=schema,
            service=EntityReference(id=self.service.id, type=self.config.service_type),
        )
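
    # DESCRIBE DETAIL is Delta Lake-specific SQL; its single result row (format,
    # location, size statistics, etc.) is returned as a plain dict, or None if the
    # statement fails.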
    def _fetch_table_description(self, table_name: str) -> Optional[Dict]:
        try:
            table_details_df = self.spark.sql(f"describe detail {table_name}")
            table_detail = table_details_df.collect()[0]
            return table_detail.asDict()
        except Exception as e:
            logger.error(e)
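
    # Parse DESCRIBE EXTENDED output for a view: every row after the
    # "# Detailed Table Information" marker is collected into a
    # {col_name: data_type} dict that is later stored as the view definition.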
    def _fetch_view_schema(self, view_name: str) -> Optional[Dict]:
        describe_output = []
        try:
            describe_output = self.spark.sql(f"describe extended {view_name}").collect()
        except Exception as e:
            logger.error(e)
            return None
        view_detail = {}
        col_details = False

        for row in describe_output:
            row_dict = row.asDict()
            if col_details:
                view_detail[row_dict["col_name"]] = row_dict["data_type"]
            if "# Detailed Table" in row_dict["col_name"]:
                col_details = True
        return view_detail
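
    # Column metadata comes from DESCRIBE <schema>.<table>; rows whose col_name is
    # empty or contains "#" mark the start of the partition-information section and
    # end column parsing. The table's Spark schema is also captured in field_dict,
    # keyed by field name.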
    def _fetch_columns(self, schema: str, table: str) -> List[Column]:
        raw_columns = []
        field_dict: Dict[str, Any] = {}
        table_name = f"{schema}.{table}"
        try:
            raw_columns = self.spark.sql(f"describe {table_name}").collect()
            for field in self.spark.table(table_name).schema:
                field_dict[field.name] = field
        except (AnalysisException, ParseException) as e:
            logger.error(e)
            return []
        parsed_columns: List[Column] = []
        partition_cols = False
        row_order = 0
        for row in raw_columns:
            col_name = row["col_name"]
            if col_name == "" or "#" in col_name:
                partition_cols = True
                continue
            if not partition_cols:
                column = Column(
                    name=row["col_name"],
                    description=row["comment"] if row["comment"] else None,
                    data_type=row["data_type"],
                    ordinal_position=row_order,
                )
                parsed_columns.append(column)
                row_order += 1

        return parsed_columns
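
    # Struct, array and map types are treated as complex; this helper lets callers
    # decide whether a column type needs further expansion.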
    def _is_complex_delta_type(self, delta_type: Any) -> bool:
        return isinstance(delta_type, (StructType, ArrayType, MapType))