mirror of
https://github.com/datahub-project/datahub.git
synced 2025-12-16 12:38:13 +00:00
ingest(mysql): add storage bytes information (#8294)
Co-authored-by: Andrew Sikowitz <andrew.sikowitz@acryl.io>
This commit is contained in:
parent
b5e039ff4e
commit
f4c0ed3aab
@ -5,6 +5,7 @@ from pydantic.fields import Field
|
|||||||
from sqlalchemy import util
|
from sqlalchemy import util
|
||||||
from sqlalchemy.dialects.mysql import base
|
from sqlalchemy.dialects.mysql import base
|
||||||
from sqlalchemy.dialects.mysql.enumerated import SET
|
from sqlalchemy.dialects.mysql.enumerated import SET
|
||||||
|
from sqlalchemy.engine.reflection import Inspector
|
||||||
|
|
||||||
from datahub.ingestion.api.decorators import (
|
from datahub.ingestion.api.decorators import (
|
||||||
SourceCapability,
|
SourceCapability,
|
||||||
@ -83,3 +84,14 @@ class MySQLSource(TwoTierSQLAlchemySource):
|
|||||||
def create(cls, config_dict, ctx):
|
def create(cls, config_dict, ctx):
|
||||||
config = MySQLConfig.parse_obj(config_dict)
|
config = MySQLConfig.parse_obj(config_dict)
|
||||||
return cls(config, ctx)
|
return cls(config, ctx)
|
||||||
|
|
||||||
|
def add_profile_metadata(self, inspector: Inspector) -> None:
|
||||||
|
if not self.config.profiling.enabled:
|
||||||
|
return
|
||||||
|
with inspector.engine.connect() as conn:
|
||||||
|
for row in conn.execute(
|
||||||
|
"SELECT table_schema, table_name, data_length from information_schema.tables"
|
||||||
|
):
|
||||||
|
self.profile_metadata_info.dataset_name_to_storage_bytes[
|
||||||
|
f"{row.table_schema}.{row.table_name}"
|
||||||
|
] = row.data_length
|
||||||
|
|||||||
@ -309,6 +309,15 @@ config_options_to_report = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ProfileMetadata:
|
||||||
|
"""
|
||||||
|
A class to hold information about the table for profile enrichment
|
||||||
|
"""
|
||||||
|
|
||||||
|
dataset_name_to_storage_bytes: Dict[str, int] = field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
class SQLAlchemySource(StatefulIngestionSourceBase):
|
class SQLAlchemySource(StatefulIngestionSourceBase):
|
||||||
"""A Base class for all SQL Sources that use SQLAlchemy to extend"""
|
"""A Base class for all SQL Sources that use SQLAlchemy to extend"""
|
||||||
|
|
||||||
@ -317,6 +326,7 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
|
|||||||
self.config = config
|
self.config = config
|
||||||
self.platform = platform
|
self.platform = platform
|
||||||
self.report: SQLSourceReport = SQLSourceReport()
|
self.report: SQLSourceReport = SQLSourceReport()
|
||||||
|
self.profile_metadata_info: ProfileMetadata = ProfileMetadata()
|
||||||
|
|
||||||
config_report = {
|
config_report = {
|
||||||
config_option: config.dict().get(config_option)
|
config_option: config.dict().get(config_option)
|
||||||
@ -484,6 +494,16 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
|
|||||||
profile_requests: List["GEProfilerRequest"] = []
|
profile_requests: List["GEProfilerRequest"] = []
|
||||||
if sql_config.profiling.enabled:
|
if sql_config.profiling.enabled:
|
||||||
profiler = self.get_profiler_instance(inspector)
|
profiler = self.get_profiler_instance(inspector)
|
||||||
|
try:
|
||||||
|
self.add_profile_metadata(inspector)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(
|
||||||
|
"Failed to get enrichment data for profiler", exc_info=True
|
||||||
|
)
|
||||||
|
self.report.report_warning(
|
||||||
|
"profile_metadata",
|
||||||
|
f"Failed to get enrichment data for profile {e}",
|
||||||
|
)
|
||||||
|
|
||||||
db_name = self.get_db_name(inspector)
|
db_name = self.get_db_name(inspector)
|
||||||
yield from self.gen_database_containers(
|
yield from self.gen_database_containers(
|
||||||
@ -1098,6 +1118,13 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
|
|||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def add_profile_metadata(self, inspector: Inspector) -> None:
|
||||||
|
"""
|
||||||
|
Method to add profile metadata in a sub-class that can be used to enrich profile metadata.
|
||||||
|
This is meant to change self.profile_metadata_info in the sub-class.
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
def loop_profiler(
|
def loop_profiler(
|
||||||
self,
|
self,
|
||||||
profile_requests: List["GEProfilerRequest"],
|
profile_requests: List["GEProfilerRequest"],
|
||||||
@ -1113,6 +1140,15 @@ class SQLAlchemySource(StatefulIngestionSourceBase):
|
|||||||
if profile is None:
|
if profile is None:
|
||||||
continue
|
continue
|
||||||
dataset_name = request.pretty_name
|
dataset_name = request.pretty_name
|
||||||
|
if (
|
||||||
|
dataset_name in self.profile_metadata_info.dataset_name_to_storage_bytes
|
||||||
|
and profile.sizeInBytes is None
|
||||||
|
):
|
||||||
|
profile.sizeInBytes = (
|
||||||
|
self.profile_metadata_info.dataset_name_to_storage_bytes[
|
||||||
|
dataset_name
|
||||||
|
]
|
||||||
|
)
|
||||||
dataset_urn = make_dataset_urn_with_platform_instance(
|
dataset_urn = make_dataset_urn_with_platform_instance(
|
||||||
self.platform,
|
self.platform,
|
||||||
dataset_name,
|
dataset_name,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user