Keshav Mohta 103857f90c
Fixes #23010 #: BigQuery Project Selection In Profiler & AutoClassification Workflow (#23233)
* fix: added code for separate engine and session for each project in profiler and classification and refactor billing project approach

* fix: added entity.database check, bigquery sampling tests

* fix: system metrics logic when bigquery billing project is provided
2025-09-05 14:09:14 +05:30

108 lines
4.1 KiB
Python

# Copyright 2025 Collate
# Licensed under the Collate Community License, Version 1.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Interfaces with database for all database engine
supporting sqlalchemy abstraction layer
"""
from copy import deepcopy
from typing import Any, Iterable, List, Optional, Tuple, Type, cast

from sqlalchemy import Column, inspect

from metadata.generated.schema.entity.data.table import SystemProfile
from metadata.generated.schema.security.credentials.gcpValues import SingleProjectId
from metadata.profiler.interface.sqlalchemy.profiler_interface import (
    SQAProfilerInterface,
)
from metadata.profiler.metrics.system.bigquery.system import (
    BigQuerySystemMetricsComputer,
)
from metadata.profiler.metrics.system.system import System
from metadata.profiler.processor.runner import QueryRunner
from metadata.utils.logger import profiler_interface_registry_logger
from metadata.utils.ssl_manager import get_ssl_connection
# Module-level logger used for the debug output emitted in this module.
logger = profiler_interface_registry_logger()
class BigQueryProfilerInterface(SQAProfilerInterface):
    """BigQuery profiler interface.

    Extends the SQLAlchemy profiler interface with three BigQuery-specific
    behaviours: the connection is created against the project named by the
    table's database, system metrics are computed with the BigQuery system
    metrics computer, and STRUCT columns are flattened into their leaf fields.
    """

    def create_session(self):
        """Create the connection for the table's project, then the session.

        The service connection config is deep-copied so the configuration
        held on ``self`` is never mutated. When the copied credentials'
        ``gcpConfig`` has a ``projectId`` attribute and the table entity
        references a database, the copy's project ID is replaced with that
        database's name before the connection is created.

        Returns:
            Whatever the parent ``create_session`` returns.
        """
        connection_config = deepcopy(self.service_connection_config)
        gcp_config = connection_config.credentials.gcpConfig

        # Create a modified connection for BigQuery with the correct project ID
        if hasattr(gcp_config, "projectId") and self.table_entity.database:
            gcp_config.projectId = SingleProjectId(
                root=self.table_entity.database.name
            )

        self.connection = get_ssl_connection(connection_config)
        return super().create_session()

    def _compute_system_metrics(
        self,
        metrics: Type[System],
        runner: QueryRunner,
        *args,
        **kwargs,
    ) -> List[SystemProfile]:
        """Compute the system metrics for the table behind ``runner``.

        Args:
            metrics: system metric class; only used here for the debug log.
            runner: query runner bound to the table being profiled.
            *args: accepted but not used by this implementation.
            **kwargs: accepted but not used by this implementation.

        Returns:
            The result of ``get_system_metrics()`` on a system metrics
            computer built with this interface's session, the runner, and
            the usage location / billing project ID taken from the service
            connection config.
        """
        logger.debug(f"Computing {metrics.name()} metric for {runner.table_name}")

        # Narrow the attribute's type for static checkers; no runtime effect.
        self.system_metrics_class = cast(
            Type[BigQuerySystemMetricsComputer], self.system_metrics_class
        )
        instance = self.system_metrics_class(
            session=self.session,
            runner=runner,
            usage_location=self.service_connection_config.usageLocation,
            billing_project_id=self.service_connection_config.billingProjectId,
        )
        return instance.get_system_metrics()

    def _get_struct_columns(
        self,
        columns: Optional[Iterable[Tuple[str, Any]]],
        parent: str,
    ) -> List[Column]:
        """Flatten the fields of a STRUCT into dotted-name columns.

        Args:
            columns: the STRUCT's fields as ``(name, type)`` pairs, which is
                the form callers in this class pass (the STRUCT's
                ``_STRUCT_fields`` entry). A ``dict`` mapping name to type
                is accepted as well, and ``None`` is treated as "no fields".
            parent: dotted path of the enclosing column; each produced column
                is named ``"<parent>.<field name>"``.

        Returns:
            One column per non-STRUCT leaf field, each attached to
            ``self.table.__table__``. Nested STRUCT fields are expanded
            recursively, in field order.
        """
        # pylint: disable=import-outside-toplevel
        from sqlalchemy_bigquery import STRUCT

        # ``__dict__.get("_STRUCT_fields")`` yields None when the attribute
        # is absent; iterating None would raise, so treat it as empty.
        if not columns:
            return []

        # Iterating a dict directly yields only its keys, so the
        # ``key, value`` unpacking below needs ``.items()`` for mappings.
        fields = columns.items() if isinstance(columns, dict) else columns

        columns_list = []
        for key, value in fields:
            qualified_name = f"{parent}.{key}"
            if isinstance(value, STRUCT):
                columns_list.extend(
                    self._get_struct_columns(
                        value.__dict__.get("_STRUCT_fields"), qualified_name
                    )
                )
                continue

            col = Column(qualified_name, value)
            # pylint: disable=protected-access
            col._set_parent(self.table.__table__)
            # pylint: enable=protected-access
            columns_list.append(col)
        return columns_list

    def get_columns(self) -> List[Column]:
        """Get columns from table, expanding STRUCT columns into their fields.

        Returns:
            The table's columns in inspection order. Each STRUCT-typed column
            is replaced by the flattened leaf columns produced by
            ``_get_struct_columns``; every other column is returned as is.
        """
        # pylint: disable=import-outside-toplevel
        from sqlalchemy_bigquery import STRUCT

        columns = []
        for column in inspect(self.table).c:
            if isinstance(column.type, STRUCT):
                columns.extend(
                    self._get_struct_columns(
                        column.type.__dict__.get("_STRUCT_fields"), column.name
                    )
                )
            else:
                columns.append(column)
        return columns