feat(ingest): measure sink bottlenecking (#10628)

This commit is contained in:
Harshal Sheth 2024-06-04 09:06:48 -07:00 committed by GitHub
parent 8f40229da2
commit fcab544f17
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -1,9 +1,9 @@
 import concurrent.futures
 import contextlib
+import dataclasses
 import functools
 import logging
 import uuid
-from dataclasses import dataclass
 from enum import auto
 from typing import Optional, Union
@@ -29,6 +29,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
     MetadataChangeProposal,
 )
 from datahub.utilities.advanced_thread_executor import PartitionExecutor
+from datahub.utilities.perf_timer import PerfTimer
 from datahub.utilities.server_config_util import set_gms_config

 logger = logging.getLogger(__name__)
@@ -44,15 +45,17 @@ class DatahubRestSinkConfig(DatahubClientConfig):

     # These only apply in async mode.
     max_threads: int = 15
-    max_pending_requests: int = 500
+    max_pending_requests: int = 2000


-@dataclass
+@dataclasses.dataclass
 class DataHubRestSinkReport(SinkReport):
-    max_threads: int = -1
-    gms_version: str = ""
+    max_threads: Optional[int] = None
+    gms_version: Optional[str] = None
     pending_requests: int = 0

+    main_thread_blocking_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
+
     def compute_stats(self) -> None:
         super().compute_stats()
@ -105,7 +108,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
self.report.gms_version = ( self.report.gms_version = (
gms_config.get("versions", {}) gms_config.get("versions", {})
.get("acryldata/datahub", {}) .get("acryldata/datahub", {})
.get("version", "") .get("version", None)
) )
self.report.max_threads = self.config.max_threads self.report.max_threads = self.config.max_threads
logger.debug("Setting env variables to override config") logger.debug("Setting env variables to override config")
@@ -189,25 +192,28 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
         ],
         write_callback: WriteCallback,
     ) -> None:
-        record = record_envelope.record
-        if self.config.mode == SyncOrAsync.ASYNC:
-            partition_key = _get_partition_key(record_envelope)
-            self.executor.submit(
-                partition_key,
-                self._emit_wrapper,
-                record,
-                done_callback=functools.partial(
-                    self._write_done_callback, record_envelope, write_callback
-                ),
-            )
-            self.report.pending_requests += 1
-        else:
-            # execute synchronously
-            try:
-                self._emit_wrapper(record)
-                write_callback.on_success(record_envelope, success_metadata={})
-            except Exception as e:
-                write_callback.on_failure(record_envelope, e, failure_metadata={})
+        # Because the default is async mode and most sources are slower than the sink, this
+        # should only have a high value if the sink is actually a bottleneck.
+        with self.report.main_thread_blocking_timer:
+            record = record_envelope.record
+            if self.config.mode == SyncOrAsync.ASYNC:
+                partition_key = _get_partition_key(record_envelope)
+                self.executor.submit(
+                    partition_key,
+                    self._emit_wrapper,
+                    record,
+                    done_callback=functools.partial(
+                        self._write_done_callback, record_envelope, write_callback
+                    ),
+                )
+                self.report.pending_requests += 1
+            else:
+                # execute synchronously
+                try:
+                    self._emit_wrapper(record)
+                    write_callback.on_success(record_envelope, success_metadata={})
+                except Exception as e:
+                    write_callback.on_failure(record_envelope, e, failure_metadata={})

     def emit_async(
         self,