feat(profiler): add query combiner report statistics (#3678)

This commit is contained in:
Harshal Sheth 2021-12-08 00:38:40 -05:00 committed by GitHub
parent 1c17ba76d2
commit a9ce255abf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 49 additions and 2 deletions

View File

@@ -1,15 +1,26 @@
import json
import pprint
import sys
from dataclasses import dataclass
from typing import Dict
# The sort_dicts option was added in Python 3.8.
if sys.version_info >= (3, 8):
PPRINT_OPTIONS = {"sort_dicts": False}
else:
PPRINT_OPTIONS: Dict = {}
@dataclass
class Report:
def as_obj(self) -> dict:
return self.__dict__
return {
key: value.as_obj() if hasattr(value, "as_obj") else value
for (key, value) in self.__dict__.items()
}
def as_string(self) -> str:
return pprint.pformat(self.as_obj(), width=150)
return pprint.pformat(self.as_obj(), width=150, **PPRINT_OPTIONS)
def as_json(self) -> str:
return json.dumps(self.as_obj())

View File

@@ -722,6 +722,8 @@ class DatahubGEProfiler:
f"Profiling {len(requests)} table(s) finished in {(timer.elapsed_seconds()):.3f} seconds"
)
self.report.report_from_query_combiner(query_combiner.report)
def _generate_profile_from_request(
self,
query_combiner: SQLAlchemyQueryCombiner,

View File

@@ -47,6 +47,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.schema import (
TimeTypeClass,
)
from datahub.metadata.schema_classes import ChangeTypeClass, DatasetPropertiesClass
from datahub.utilities.sqlalchemy_query_combiner import SQLAlchemyQueryCombinerReport
if TYPE_CHECKING:
from datahub.ingestion.source.ge_data_profiler import (
@@ -124,6 +125,8 @@ class SQLSourceReport(SourceReport):
views_scanned: int = 0
filtered: List[str] = field(default_factory=list)
query_combiner: Optional[SQLAlchemyQueryCombinerReport] = None
def report_entity_scanned(self, name: str, ent_type: str = "table") -> None:
"""
Entity could be a view or a table
@@ -138,6 +141,11 @@ class SQLSourceReport(SourceReport):
def report_dropped(self, ent_name: str) -> None:
    """Record an entity (table/view name) that was filtered out of ingestion."""
    self.filtered.append(ent_name)
def report_from_query_combiner(
    self, query_combiner_report: SQLAlchemyQueryCombinerReport
) -> None:
    """Attach the query combiner's statistics to this source report.

    Stores the report object by reference, so counter updates made by the
    combiner after this call remain visible when the report is rendered.
    """
    self.query_combiner = query_combiner_report
class SQLAlchemyConfig(ConfigModel):
env: str = DEFAULT_ENV

View File

@@ -17,6 +17,8 @@ from sqlalchemy.engine import Connection
from sqlalchemy.orm.exc import MultipleResultsFound, NoResultFound
from typing_extensions import ParamSpec
from datahub.ingestion.api.report import Report
logger: logging.Logger = logging.getLogger(__name__)
P = ParamSpec("P")
@@ -113,6 +115,17 @@ def get_query_columns(query: Any) -> List[Any]:
return list(query.columns)
@dataclasses.dataclass
class SQLAlchemyQueryCombinerReport(Report):
    """Counters describing how the SQLAlchemy query combiner batched queries."""

    # Total number of queries intercepted by the combiner's execute hook.
    total_queries: int = 0
    # Queries executed individually (normal path or serial fallback).
    uncombined_queries_issued: int = 0
    # Combined (batched) statements actually issued to the database.
    combined_queries_issued: int = 0
    # Individual queries that were queued for merging into a combined statement.
    queries_combined: int = 0
    # Exceptions caught while executing or combining queries.
    query_exceptions: int = 0
@dataclasses.dataclass
class SQLAlchemyQueryCombiner:
"""
@@ -126,6 +139,12 @@ class SQLAlchemyQueryCombiner:
is_single_row_query_method: Callable[[Any], bool]
serial_execution_fallback_enabled: bool
# The Python GIL ensures that modifications to the report's counters
# are safe.
report: SQLAlchemyQueryCombinerReport = dataclasses.field(
default_factory=SQLAlchemyQueryCombinerReport
)
# There will be one main greenlet per thread. As such, queries will be
# queued according to the main greenlet's thread ID. We also keep track
# of the greenlets we spawn for bookkeeping purposes.
@@ -202,6 +221,7 @@ class SQLAlchemyQueryCombiner:
query_id = SQLAlchemyQueryCombiner._generate_sql_safe_identifier()
query_future = _QueryFuture(conn, query, multiparams, params)
queue[query_id] = query_future
self.report.queries_combined += 1
# Yield control back to the main greenlet until the query is done.
# We assume that the main greenlet will be the one that actually executes the query.
@@ -217,6 +237,7 @@ class SQLAlchemyQueryCombiner:
conn: Connection, query: Any, *args: Any, **kwargs: Any
) -> Any:
try:
self.report.total_queries += 1
handled, result = self._handle_execute(conn, query, args, kwargs)
except Exception as e:
if not self.catch_exceptions:
@@ -224,6 +245,7 @@ class SQLAlchemyQueryCombiner:
logger.exception(
f"Failed to execute query normally, using fallback: {str(query)}"
)
self.report.query_exceptions += 1
return _sa_execute_underlying_method(conn, query, *args, **kwargs)
else:
if handled:
@@ -234,6 +256,7 @@ class SQLAlchemyQueryCombiner:
return result.res
else:
logger.debug(f"Executing query normally: {str(query)}")
self.report.uncombined_queries_issued += 1
return _sa_execute_underlying_method(conn, query, *args, **kwargs)
with _sa_execute_method_patching_lock:
@@ -289,6 +312,7 @@ class SQLAlchemyQueryCombiner:
combined_query.append_from(cte)
logger.debug(f"Executing combined query: {str(combined_query)}")
self.report.combined_queries_issued += 1
sa_res = _sa_execute_underlying_method(queue_item.conn, combined_query)
# Fetch the results and ensure that exactly one row is returned.
@@ -322,6 +346,7 @@ class SQLAlchemyQueryCombiner:
continue
logger.debug(f"Executing query via fallback: {str(query_future.query)}")
self.report.uncombined_queries_issued += 1
try:
res = _sa_execute_underlying_method(
query_future.conn,
@@ -351,6 +376,7 @@ class SQLAlchemyQueryCombiner:
if not self.serial_execution_fallback_enabled:
raise e
logger.exception(f"Failed to execute queue using combiner: {str(e)}")
self.report.query_exceptions += 1
self._execute_queue_fallback(main_greenlet)
for let in list(pool):