Mirror of https://github.com/open-metadata/OpenMetadata.git (synced 2025-09-30 11:26:23 +00:00)
* fix: added code for a separate engine and session for each project in the profiler and classification workflows, and refactored the billing project approach
* fix: added entity.database check and BigQuery sampling tests
* fix: corrected system metrics logic when a BigQuery billing project is provided
This commit is contained in:
parent 972afc375a
commit 103857f90c
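In short, the billing project no longer rides on the connection URL: get_connection_url always points at the data project, and get_connection forwards billingProjectId (when set) as a billing_project_id keyword argument through create_generic_db_connection into SQLAlchemy's create_engine, where the BigQuery dialect is expected to pick it up. A minimal, self-contained sketch of that plumbing, using hypothetical helper names and made-up project IDs rather than the real OpenMetadata models:

from types import SimpleNamespace
from typing import Any, Dict


def build_connection_url(connection) -> str:
    # After the refactor the URL host is always the data project;
    # the billing project no longer appears in the URL.
    return f"bigquery://{connection.projectId}"


def build_engine_kwargs(connection) -> Dict[str, Any]:
    # These are the extra kwargs that get_connection now passes through
    # create_generic_db_connection(**kwargs) into create_engine.
    kwargs: Dict[str, Any] = {}
    if connection.billingProjectId:
        kwargs["billing_project_id"] = connection.billingProjectId
    return kwargs


# Hypothetical connection values, for illustration only.
conn = SimpleNamespace(projectId="data-project", billingProjectId="billing-project")
print(build_connection_url(conn))  # bigquery://data-project
print(build_engine_kwargs(conn))   # {'billing_project_id': 'billing-project'}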
@@ -56,7 +56,10 @@ def get_connection_args_common(connection) -> Dict[str, Any]:


 def create_generic_db_connection(
-    connection, get_connection_url_fn: Callable, get_connection_args_fn: Callable
+    connection,
+    get_connection_url_fn: Callable,
+    get_connection_args_fn: Callable,
+    **kwargs,
 ) -> Engine:
     """
     Generic Engine creation from connection object
@@ -75,6 +78,7 @@ def create_generic_db_connection(
         pool_reset_on_return=None,  # https://docs.sqlalchemy.org/en/14/core/pooling.html#reset-on-return
         echo=False,
         max_overflow=-1,
+        **kwargs,
     )

     attach_query_tracker(engine)
@@ -31,6 +31,7 @@ from metadata.generated.schema.entity.services.connections.testConnectionResult
     TestConnectionResult,
 )
 from metadata.generated.schema.security.credentials.gcpCredentials import (
+    GcpADC,
     GcpCredentialsPath,
 )
 from metadata.generated.schema.security.credentials.gcpValues import (
@@ -68,25 +69,21 @@ def get_connection_url(connection: BigQueryConnection) -> str:
             connection.credentials.gcpConfig.projectId, SingleProjectId
         ):
             if not connection.credentials.gcpConfig.projectId.root:
-                return f"{connection.scheme.value}://{connection.billingProjectId or connection.credentials.gcpConfig.projectId.root or ''}"
+                return f"{connection.scheme.value}://{connection.credentials.gcpConfig.projectId.root or ''}"
             if (
                 not connection.credentials.gcpConfig.privateKey
                 and connection.credentials.gcpConfig.projectId.root
             ):
                 project_id = connection.credentials.gcpConfig.projectId.root
-                os.environ["GOOGLE_CLOUD_PROJECT"] = (
-                    connection.billingProjectId or project_id
-                )
-            return f"{connection.scheme.value}://{connection.billingProjectId or connection.credentials.gcpConfig.projectId.root}"
+                os.environ["GOOGLE_CLOUD_PROJECT"] = project_id
+            return f"{connection.scheme.value}://{connection.credentials.gcpConfig.projectId.root}"
         elif isinstance(connection.credentials.gcpConfig.projectId, MultipleProjectId):
             for project_id in connection.credentials.gcpConfig.projectId.root:
                 if not connection.credentials.gcpConfig.privateKey and project_id:
                     # Setting environment variable based on project id given by user / set in ADC
-                    os.environ["GOOGLE_CLOUD_PROJECT"] = (
-                        connection.billingProjectId or project_id
-                    )
-                return f"{connection.scheme.value}://{connection.billingProjectId or project_id}"
-            return f"{connection.scheme.value}://{connection.billingProjectId or ''}"
+                    os.environ["GOOGLE_CLOUD_PROJECT"] = project_id
+                return f"{connection.scheme.value}://{project_id}"
+            return f"{connection.scheme.value}://"

         # If gcpConfig is the JSON key path and projectId is defined, we use it by default
         elif (
@@ -96,13 +93,27 @@ def get_connection_url(connection: BigQueryConnection) -> str:
         if isinstance(  # pylint: disable=no-else-return
             connection.credentials.gcpConfig.projectId, SingleProjectId
         ):
-            return f"{connection.scheme.value}://{connection.billingProjectId or connection.credentials.gcpConfig.projectId.root}"
+            return f"{connection.scheme.value}://{connection.credentials.gcpConfig.projectId.root}"

         elif isinstance(connection.credentials.gcpConfig.projectId, MultipleProjectId):
             for project_id in connection.credentials.gcpConfig.projectId.root:
-                return f"{connection.scheme.value}://{connection.billingProjectId or project_id}"
+                return f"{connection.scheme.value}://{project_id}"

-    return f"{connection.scheme.value}://{connection.billingProjectId or ''}"
+    # If gcpConfig is the GCP ADC and projectId is defined, we use it by default
+    elif (
+        isinstance(connection.credentials.gcpConfig, GcpADC)
+        and connection.credentials.gcpConfig.projectId
+    ):
+        if isinstance(  # pylint: disable=no-else-return
+            connection.credentials.gcpConfig.projectId, SingleProjectId
+        ):
+            return f"{connection.scheme.value}://{connection.credentials.gcpConfig.projectId.root}"
+
+        elif isinstance(connection.credentials.gcpConfig.projectId, MultipleProjectId):
+            for project_id in connection.credentials.gcpConfig.projectId.root:
+                return f"{connection.scheme.value}://{project_id}"
+
+    return f"{connection.scheme.value}://"


 def get_connection(connection: BigQueryConnection) -> Engine:
@@ -110,10 +121,15 @@ def get_connection(connection: BigQueryConnection) -> Engine:
     Prepare the engine and the GCP credentials
     """
     set_google_credentials(gcp_credentials=connection.credentials)
+    kwargs = {}
+    if connection.billingProjectId:
+        kwargs["billing_project_id"] = connection.billingProjectId
+
     return create_generic_db_connection(
         connection=connection,
         get_connection_url_fn=get_connection_url,
         get_connection_args_fn=get_connection_args_common,
+        **kwargs,
     )

@@ -211,13 +211,18 @@ class BigQueryQueryResult(BaseModel):
         usage_location: str,
         dataset_id: str,
         project_id: str,
+        billing_project_id: Optional[str] = None,
     ):
+        # Use billing project for the INFORMATION_SCHEMA query if provided
+        query_project_id = billing_project_id or project_id
+
         rows = session.execute(
             text(
                 JOBS.format(
                     usage_location=usage_location,
                     dataset_id=dataset_id,
                     project_id=project_id,
+                    query_project_id=query_project_id,
                     insert=DatabaseDMLOperations.INSERT.value,
                     update=DatabaseDMLOperations.UPDATE.value,
                     delete=DatabaseDMLOperations.DELETE.value,
@@ -240,7 +245,7 @@ JOBS = """
     dml_statistics.deleted_row_count as deleted_row_count,
     dml_statistics.updated_row_count as updated_row_count
   FROM
-    `region-{usage_location}`.INFORMATION_SCHEMA.JOBS
+    `{query_project_id}`.`region-{usage_location}`.INFORMATION_SCHEMA.JOBS
   WHERE
     DATE(creation_time) >= CURRENT_DATE() - 1 AND
     destination_table.dataset_id = '{dataset_id}' AND
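With query_project_id wired into the template, the INFORMATION_SCHEMA.JOBS lookup is fully qualified against the billing project when one is configured, and falls back to the data project otherwise. A quick illustration of the string formatting only, with a hypothetical project ID and location:

# Only the FROM fragment of the JOBS template, extracted here for illustration.
JOBS_FROM = "`{query_project_id}`.`region-{usage_location}`.INFORMATION_SCHEMA.JOBS"

print(JOBS_FROM.format(query_project_id="my-billing-project", usage_location="us"))
# `my-billing-project`.`region-us`.INFORMATION_SCHEMA.JOBS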
@@ -13,11 +13,13 @@
 Interfaces with database for all database engine
 supporting sqlalchemy abstraction layer
 """
+from copy import deepcopy
 from typing import List, Type, cast

 from sqlalchemy import Column, inspect

 from metadata.generated.schema.entity.data.table import SystemProfile
+from metadata.generated.schema.security.credentials.gcpValues import SingleProjectId
 from metadata.profiler.interface.sqlalchemy.profiler_interface import (
     SQAProfilerInterface,
 )
@@ -27,6 +29,7 @@ from metadata.profiler.metrics.system.bigquery.system import (
 from metadata.profiler.metrics.system.system import System
 from metadata.profiler.processor.runner import QueryRunner
 from metadata.utils.logger import profiler_interface_registry_logger
+from metadata.utils.ssl_manager import get_ssl_connection

 logger = profiler_interface_registry_logger()

@@ -34,6 +37,19 @@ logger = profiler_interface_registry_logger()
 class BigQueryProfilerInterface(SQAProfilerInterface):
     """BigQuery profiler interface"""

+    def create_session(self):
+        connection_config = deepcopy(self.service_connection_config)
+        # Create a modified connection for BigQuery with the correct project ID
+        if (
+            hasattr(connection_config.credentials.gcpConfig, "projectId")
+            and self.table_entity.database
+        ):
+            connection_config.credentials.gcpConfig.projectId = SingleProjectId(
+                root=self.table_entity.database.name
+            )
+        self.connection = get_ssl_connection(connection_config)
+        return super().create_session()
+
     def _compute_system_metrics(
         self,
         metrics: Type[System],
@@ -49,6 +65,7 @@ class BigQueryProfilerInterface(SQAProfilerInterface):
             session=self.session,
             runner=runner,
             usage_location=self.service_connection_config.usageLocation,
+            billing_project_id=self.service_connection_config.billingProjectId,
         )
         return instance.get_system_metrics()

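The create_session override above is the heart of the "separate engine and session for each project" change: the shared service connection is deep-copied, its projectId is replaced with the project that owns the table being profiled (surfaced as the table entity's database), and the engine is rebuilt from that copy before the session is created. A standalone sketch of the pattern with simplified stand-in classes (the real code uses SingleProjectId, get_ssl_connection and a thread-safe session factory):

from copy import deepcopy
from dataclasses import dataclass
from typing import Optional


@dataclass
class GcpConfig:
    projectId: Optional[str] = None


@dataclass
class ServiceConnection:
    gcpConfig: GcpConfig
    billingProjectId: Optional[str] = None


def engine_for_table(service_connection: ServiceConnection, table_project: Optional[str]) -> str:
    # Deep-copy so the shared service connection is never mutated,
    # then point the copy at the table's own project when it is known.
    config = deepcopy(service_connection)
    if hasattr(config.gcpConfig, "projectId") and table_project:
        config.gcpConfig.projectId = table_project
    return f"engine(project={config.gcpConfig.projectId}, billing={config.billingProjectId})"


base = ServiceConnection(GcpConfig("project-a"), billingProjectId="billing-project")
print(engine_for_table(base, "project-b"))  # engine(project=project-b, billing=billing-project)
print(engine_for_table(base, None))         # engine(project=project-a, billing=billing-project)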
@@ -1,6 +1,6 @@
 """BigQuery system metric source"""

-from typing import List
+from typing import List, Optional

 from pydantic import TypeAdapter
 from sqlalchemy.orm import Session
@@ -30,12 +30,14 @@ class BigQuerySystemMetricsComputer(SystemMetricsComputer, CacheProvider):
         session: Session,
         runner: QueryRunner,
         usage_location: str,
+        billing_project_id: Optional[str] = None,
     ):
         self.session = session
         self.table = runner.table_name
         self.project_id = runner.session.get_bind().url.host
         self.dataset_id = runner.schema_name
         self.usage_location = usage_location
+        self.billing_project_id = billing_project_id or self.project_id

     def get_deletes(self) -> List[SystemProfile]:
         return self.get_system_profile(
@@ -116,6 +118,7 @@ class BigQuerySystemMetricsComputer(SystemMetricsComputer, CacheProvider):
             usage_location=usage_location,
             project_id=project_id,
             dataset_id=dataset_id,
+            billing_project_id=self.billing_project_id,
         )

     @staticmethod
@@ -304,8 +304,7 @@ class BigQueryTableMetricComputer(BaseTableMetricComputer):
         ]

         where_clause = [
-            Column("project_id")
-            == self.conn_config.credentials.gcpConfig.projectId.root,
+            Column("project_id") == self._entity.database.name,
             Column("table_schema") == self.schema_name,
             Column("table_name") == self.table_name,
         ]
@@ -338,17 +337,14 @@ class BigQueryTableMetricComputer(BaseTableMetricComputer):
             *self._get_col_names_and_count(),
         ]
         where_clause = [
-            Column("project_id")
-            == self.conn_config.credentials.gcpConfig.projectId.root,
+            Column("project_id") == self._entity.database.name,
             Column("dataset_id") == self.schema_name,
             Column("table_id") == self.table_name,
         ]
         schema = (
-            self.schema_name.startswith(
-                f"{self.conn_config.credentials.gcpConfig.projectId.root}."
-            )
+            self.schema_name.startswith(f"{self._entity.database.name}.")
             and self.schema_name
-            or f"{self.conn_config.credentials.gcpConfig.projectId.root}.{self.schema_name}"
+            or f"{self._entity.database.name}.{self.schema_name}"
         )
         query = self._build_query(
             columns,
@@ -12,6 +12,7 @@
 Helper module to handle data sampling
 for the profiler
 """
+from copy import deepcopy
 from typing import Dict, Optional, Union

 from sqlalchemy import Column
@@ -31,11 +32,14 @@ from metadata.generated.schema.entity.services.connections.database.datalakeConn
     DatalakeConnection,
 )
 from metadata.generated.schema.entity.services.databaseService import DatabaseConnection
+from metadata.generated.schema.security.credentials.gcpValues import SingleProjectId
+from metadata.ingestion.connections.session import create_and_bind_thread_safe_session
 from metadata.ingestion.ometa.ometa_api import OpenMetadata
 from metadata.sampler.models import SampleConfig
 from metadata.sampler.sqlalchemy.sampler import SQASampler
 from metadata.utils.constants import SAMPLE_DATA_DEFAULT_COUNT
 from metadata.utils.logger import profiler_interface_registry_logger
+from metadata.utils.ssl_manager import get_ssl_connection

 logger = profiler_interface_registry_logger()

@@ -74,6 +78,19 @@ class BigQuerySampler(SQASampler):
         )
         self.raw_dataset_type: Optional[TableType] = entity.tableType

+        connection_config = deepcopy(service_connection_config)
+        # Create a modified connection for BigQuery with the correct project ID
+        if (
+            hasattr(connection_config.credentials.gcpConfig, "projectId")
+            and self.entity.database
+        ):
+            connection_config.credentials.gcpConfig.projectId = SingleProjectId(
+                root=self.entity.database.name
+            )
+        self.connection = get_ssl_connection(connection_config)
+
+        self.session_factory = create_and_bind_thread_safe_session(self.connection)
+
     def set_tablesample(self, selectable: SqaTable):
         """Set the TABLESAMPLE clause for BigQuery
         Args:
@@ -8,6 +8,7 @@ source:
       taxonomyProjectID:
         - $E2E_BQ_PROJECT_ID
         - $E2E_BQ_PROJECT_ID2
+      billingProjectId: $E2E_BQ_PROJECT_ID2
       credentials:
         gcpConfig:
           type: service_account
@@ -22,6 +22,7 @@ from metadata.generated.schema.security.credentials.gcpCredentials import GCPCre
 from metadata.generated.schema.security.credentials.gcpValues import (
     GcpCredentialsValues,
 )
+from metadata.generated.schema.type.entityReference import EntityReference
 from metadata.profiler.interface.sqlalchemy.profiler_interface import (
     SQAProfilerInterface,
 )
@@ -69,6 +70,7 @@ class SampleTest(TestCase):
                     dataType=DataType.INT,
                 ),
             ],
+            database=EntityReference(id=uuid4(), name="myproject", type="database"),
         )

         cls.bq_conn = BigQueryConnection(
@@ -137,6 +139,7 @@ class SampleTest(TestCase):
                 ),
             ],
             tableType=TableType.View,
+            database=EntityReference(id=uuid4(), name="myproject", type="database"),
         )

         sampler = BigQuerySampler(
@@ -172,6 +175,7 @@ class SampleTest(TestCase):
                 ),
             ],
             tableType=TableType.View,
+            database=EntityReference(id=uuid4(), name="myproject", type="database"),
         )

         sampler = BigQuerySampler(