feat(profiler): add query combiner report statistics (#3678)
commit a9ce255abf
parent 1c17ba76d2
--- a/metadata-ingestion/src/datahub/ingestion/api/report.py
+++ b/metadata-ingestion/src/datahub/ingestion/api/report.py
@@ -1,15 +1,26 @@
 import json
 import pprint
+import sys
 from dataclasses import dataclass
+from typing import Dict
+
+# The sort_dicts option was added in Python 3.8.
+if sys.version_info >= (3, 8):
+    PPRINT_OPTIONS = {"sort_dicts": False}
+else:
+    PPRINT_OPTIONS: Dict = {}
 
 
 @dataclass
 class Report:
     def as_obj(self) -> dict:
-        return self.__dict__
+        return {
+            key: value.as_obj() if hasattr(value, "as_obj") else value
+            for (key, value) in self.__dict__.items()
+        }
 
     def as_string(self) -> str:
-        return pprint.pformat(self.as_obj(), width=150)
+        return pprint.pformat(self.as_obj(), width=150, **PPRINT_OPTIONS)
 
     def as_json(self) -> str:
         return json.dumps(self.as_obj())
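
A quick sketch of what the recursive as_obj() buys: nested Report objects now
serialize through their own as_obj() instead of leaking raw dataclass instances
into the parent dict, which also makes as_json() work for nested reports. The
InnerReport/OuterReport classes below are hypothetical illustrations, not part
of the codebase.

    from dataclasses import dataclass, field

    from datahub.ingestion.api.report import Report


    @dataclass
    class InnerReport(Report):  # hypothetical example class
        queries: int = 3


    @dataclass
    class OuterReport(Report):  # hypothetical example class
        tables_scanned: int = 1
        inner: InnerReport = field(default_factory=InnerReport)


    print(OuterReport().as_json())
    # {"tables_scanned": 1, "inner": {"queries": 3}}

On Python 3.8+, as_string() now also passes sort_dicts=False so keys print in
declaration order; older versions fall back to pprint's default key sorting.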

--- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py
@@ -722,6 +722,8 @@ class DatahubGEProfiler:
             f"Profiling {len(requests)} table(s) finished in {(timer.elapsed_seconds()):.3f} seconds"
         )
 
+        self.report.report_from_query_combiner(query_combiner.report)
+
     def _generate_profile_from_request(
         self,
         query_combiner: SQLAlchemyQueryCombiner,

--- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py
@@ -47,6 +47,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
     TimeTypeClass,
 )
 from datahub.metadata.schema_classes import ChangeTypeClass, DatasetPropertiesClass
+from datahub.utilities.sqlalchemy_query_combiner import SQLAlchemyQueryCombinerReport
 
 if TYPE_CHECKING:
     from datahub.ingestion.source.ge_data_profiler import (
@@ -124,6 +125,8 @@ class SQLSourceReport(SourceReport):
     views_scanned: int = 0
     filtered: List[str] = field(default_factory=list)
 
+    query_combiner: Optional[SQLAlchemyQueryCombinerReport] = None
+
     def report_entity_scanned(self, name: str, ent_type: str = "table") -> None:
         """
         Entity could be a view or a table
@@ -138,6 +141,11 @@ class SQLSourceReport(SourceReport):
     def report_dropped(self, ent_name: str) -> None:
         self.filtered.append(ent_name)
 
+    def report_from_query_combiner(
+        self, query_combiner_report: SQLAlchemyQueryCombinerReport
+    ) -> None:
+        self.query_combiner = query_combiner_report
+
 
 class SQLAlchemyConfig(ConfigModel):
     env: str = DEFAULT_ENV
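
For context, a minimal sketch of how these pieces connect at the end of a
profiling run. The wiring below is illustrative (the import paths are assumed
from the repo layout); only the class and method names come from this diff.

    from datahub.ingestion.source.sql.sql_common import SQLSourceReport
    from datahub.utilities.sqlalchemy_query_combiner import (
        SQLAlchemyQueryCombinerReport,
    )

    report = SQLSourceReport()
    combiner_report = SQLAlchemyQueryCombinerReport(total_queries=10, queries_combined=8)

    # This mirrors what DatahubGEProfiler now does once profiling finishes:
    report.report_from_query_combiner(combiner_report)
    assert report.query_combiner is combiner_report

Because SQLAlchemyQueryCombinerReport subclasses Report, and the new recursive
as_obj() serializes nested reports, the combiner counters appear in the
ingestion summary without any extra plumbing.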

--- a/metadata-ingestion/src/datahub/utilities/sqlalchemy_query_combiner.py
+++ b/metadata-ingestion/src/datahub/utilities/sqlalchemy_query_combiner.py
@@ -17,6 +17,8 @@ from sqlalchemy.engine import Connection
 from sqlalchemy.orm.exc import MultipleResultsFound, NoResultFound
 from typing_extensions import ParamSpec
 
+from datahub.ingestion.api.report import Report
+
 logger: logging.Logger = logging.getLogger(__name__)
 
 P = ParamSpec("P")
@@ -113,6 +115,17 @@ def get_query_columns(query: Any) -> List[Any]:
     return list(query.columns)
 
 
+@dataclasses.dataclass
+class SQLAlchemyQueryCombinerReport(Report):
+    total_queries: int = 0
+    uncombined_queries_issued: int = 0
+
+    combined_queries_issued: int = 0
+    queries_combined: int = 0
+
+    query_exceptions: int = 0
+
+
 @dataclasses.dataclass
 class SQLAlchemyQueryCombiner:
     """
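
One plausible reading of the new counters (an interpretation, not documentation
from the source): total_queries counts every execute() the patched connection
sees; queries_combined counts the ones queued for batching; combined_queries_issued
counts the batched statements actually sent; uncombined_queries_issued counts
queries sent individually (non-combinable, or issued via a fallback); and
query_exceptions counts failures that triggered a fallback. On a clean run the
counters should reconcile:

    report = SQLAlchemyQueryCombinerReport(
        total_queries=12,
        queries_combined=10,
        combined_queries_issued=2,
        uncombined_queries_issued=2,
        query_exceptions=0,
    )
    # Every query was either queued for combining or issued on its own:
    assert report.queries_combined + report.uncombined_queries_issued == report.total_queries
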
@@ -126,6 +139,12 @@ class SQLAlchemyQueryCombiner:
     is_single_row_query_method: Callable[[Any], bool]
     serial_execution_fallback_enabled: bool
 
+    # The Python GIL ensures that modifications to the report's counters
+    # are safe.
+    report: SQLAlchemyQueryCombinerReport = dataclasses.field(
+        default_factory=SQLAlchemyQueryCombinerReport
+    )
+
     # There will be one main greenlet per thread. As such, queries will be
     # queued according to the main greenlet's thread ID. We also keep track
     # of the greenlets we spawn for bookkeeping purposes.
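
A side note on the field declaration: a plain instance default would be shared
by every combiner (and newer Python versions reject unhashable dataclass
defaults outright), so default_factory gives each instance its own counters.
A minimal sketch, using a hypothetical _Example class:

    import dataclasses

    @dataclasses.dataclass
    class _Example:  # hypothetical, for illustration only
        report: SQLAlchemyQueryCombinerReport = dataclasses.field(
            default_factory=SQLAlchemyQueryCombinerReport
        )

    a, b = _Example(), _Example()
    assert a.report is not b.report  # independent counters per instance
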
@@ -202,6 +221,7 @@ class SQLAlchemyQueryCombiner:
         query_id = SQLAlchemyQueryCombiner._generate_sql_safe_identifier()
         query_future = _QueryFuture(conn, query, multiparams, params)
         queue[query_id] = query_future
+        self.report.queries_combined += 1
 
         # Yield control back to the main greenlet until the query is done.
         # We assume that the main greenlet will be the one that actually executes the query.
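
The "yield control back to the main greenlet" comments refer to plain greenlet
switch mechanics. The combiner's real flow is more involved, but a minimal
standalone sketch of the primitive looks like this:

    import greenlet

    def worker(query):
        # Queue the query, then pause until the parent switches back with a result.
        result = greenlet.getcurrent().parent.switch("queued")
        return result

    g = greenlet.greenlet(worker)
    print(g.switch("SELECT 1"))  # "queued" -- worker paused at parent.switch()
    print(g.switch("row"))       # "row" -- worker resumed, returned the result
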
@@ -217,6 +237,7 @@ class SQLAlchemyQueryCombiner:
             conn: Connection, query: Any, *args: Any, **kwargs: Any
         ) -> Any:
             try:
+                self.report.total_queries += 1
                 handled, result = self._handle_execute(conn, query, args, kwargs)
             except Exception as e:
                 if not self.catch_exceptions:
@@ -224,6 +245,7 @@ class SQLAlchemyQueryCombiner:
                 logger.exception(
                     f"Failed to execute query normally, using fallback: {str(query)}"
                 )
+                self.report.query_exceptions += 1
                 return _sa_execute_underlying_method(conn, query, *args, **kwargs)
             else:
                 if handled:
@@ -234,6 +256,7 @@ class SQLAlchemyQueryCombiner:
                     return result.res
                 else:
                     logger.debug(f"Executing query normally: {str(query)}")
+                    self.report.uncombined_queries_issued += 1
                     return _sa_execute_underlying_method(conn, query, *args, **kwargs)
 
         with _sa_execute_method_patching_lock:
@@ -289,6 +312,7 @@ class SQLAlchemyQueryCombiner:
             combined_query.append_from(cte)
 
         logger.debug(f"Executing combined query: {str(combined_query)}")
+        self.report.combined_queries_issued += 1
         sa_res = _sa_execute_underlying_method(queue_item.conn, combined_query)
 
         # Fetch the results and ensure that exactly one row is returned.
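
For intuition about what combined_queries_issued counts, here is a hedged
sketch of the combining idea in SQLAlchemy 1.4 syntax (not the actual
implementation, and the "orders" table is made up): each single-row query
becomes a CTE, and one SELECT reads a column from every CTE, collapsing N
round trips into one.

    import sqlalchemy as sa

    q1 = sa.select(sa.func.count().label("cnt")).select_from(sa.table("orders")).cte("q1")
    q2 = (
        sa.select(sa.func.max(sa.column("id")).label("max_id"))
        .select_from(sa.table("orders"))
        .cte("q2")
    )

    # One statement instead of two; each original query contributes one CTE.
    combined = sa.select(q1.c.cnt, q2.c.max_id)
    print(combined)
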
@@ -322,6 +346,7 @@ class SQLAlchemyQueryCombiner:
                 continue
 
             logger.debug(f"Executing query via fallback: {str(query_future.query)}")
+            self.report.uncombined_queries_issued += 1
             try:
                 res = _sa_execute_underlying_method(
                     query_future.conn,
@@ -351,6 +376,7 @@ class SQLAlchemyQueryCombiner:
             if not self.serial_execution_fallback_enabled:
                 raise e
             logger.exception(f"Failed to execute queue using combiner: {str(e)}")
+            self.report.query_exceptions += 1
             self._execute_queue_fallback(main_greenlet)
 
         for let in list(pool):