mirror of
https://github.com/datahub-project/datahub.git
synced 2025-11-12 17:34:18 +00:00
feat(ingest): measure sink bottlenecking (#10628)
This commit is contained in:
parent
8f40229da2
commit
fcab544f17
@ -1,9 +1,9 @@
|
|||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
import contextlib
|
import contextlib
|
||||||
|
import dataclasses
|
||||||
import functools
|
import functools
|
||||||
import logging
|
import logging
|
||||||
import uuid
|
import uuid
|
||||||
from dataclasses import dataclass
|
|
||||||
from enum import auto
|
from enum import auto
|
||||||
from typing import Optional, Union
|
from typing import Optional, Union
|
||||||
|
|
||||||
@ -29,6 +29,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
|
|||||||
MetadataChangeProposal,
|
MetadataChangeProposal,
|
||||||
)
|
)
|
||||||
from datahub.utilities.advanced_thread_executor import PartitionExecutor
|
from datahub.utilities.advanced_thread_executor import PartitionExecutor
|
||||||
|
from datahub.utilities.perf_timer import PerfTimer
|
||||||
from datahub.utilities.server_config_util import set_gms_config
|
from datahub.utilities.server_config_util import set_gms_config
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@ -44,15 +45,17 @@ class DatahubRestSinkConfig(DatahubClientConfig):
|
|||||||
|
|
||||||
# These only apply in async mode.
|
# These only apply in async mode.
|
||||||
max_threads: int = 15
|
max_threads: int = 15
|
||||||
max_pending_requests: int = 500
|
max_pending_requests: int = 2000
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclasses.dataclass
|
||||||
class DataHubRestSinkReport(SinkReport):
|
class DataHubRestSinkReport(SinkReport):
|
||||||
max_threads: int = -1
|
max_threads: Optional[int] = None
|
||||||
gms_version: str = ""
|
gms_version: Optional[str] = None
|
||||||
pending_requests: int = 0
|
pending_requests: int = 0
|
||||||
|
|
||||||
|
main_thread_blocking_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
|
||||||
|
|
||||||
def compute_stats(self) -> None:
|
def compute_stats(self) -> None:
|
||||||
super().compute_stats()
|
super().compute_stats()
|
||||||
|
|
||||||
@ -105,7 +108,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
|
|||||||
self.report.gms_version = (
|
self.report.gms_version = (
|
||||||
gms_config.get("versions", {})
|
gms_config.get("versions", {})
|
||||||
.get("acryldata/datahub", {})
|
.get("acryldata/datahub", {})
|
||||||
.get("version", "")
|
.get("version", None)
|
||||||
)
|
)
|
||||||
self.report.max_threads = self.config.max_threads
|
self.report.max_threads = self.config.max_threads
|
||||||
logger.debug("Setting env variables to override config")
|
logger.debug("Setting env variables to override config")
|
||||||
@ -189,25 +192,28 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
|
|||||||
],
|
],
|
||||||
write_callback: WriteCallback,
|
write_callback: WriteCallback,
|
||||||
) -> None:
|
) -> None:
|
||||||
record = record_envelope.record
|
# Because the default is async mode and most sources are slower than the sink, this
|
||||||
if self.config.mode == SyncOrAsync.ASYNC:
|
# should only have a high value if the sink is actually a bottleneck.
|
||||||
partition_key = _get_partition_key(record_envelope)
|
with self.report.main_thread_blocking_timer:
|
||||||
self.executor.submit(
|
record = record_envelope.record
|
||||||
partition_key,
|
if self.config.mode == SyncOrAsync.ASYNC:
|
||||||
self._emit_wrapper,
|
partition_key = _get_partition_key(record_envelope)
|
||||||
record,
|
self.executor.submit(
|
||||||
done_callback=functools.partial(
|
partition_key,
|
||||||
self._write_done_callback, record_envelope, write_callback
|
self._emit_wrapper,
|
||||||
),
|
record,
|
||||||
)
|
done_callback=functools.partial(
|
||||||
self.report.pending_requests += 1
|
self._write_done_callback, record_envelope, write_callback
|
||||||
else:
|
),
|
||||||
# execute synchronously
|
)
|
||||||
try:
|
self.report.pending_requests += 1
|
||||||
self._emit_wrapper(record)
|
else:
|
||||||
write_callback.on_success(record_envelope, success_metadata={})
|
# execute synchronously
|
||||||
except Exception as e:
|
try:
|
||||||
write_callback.on_failure(record_envelope, e, failure_metadata={})
|
self._emit_wrapper(record)
|
||||||
|
write_callback.on_success(record_envelope, success_metadata={})
|
||||||
|
except Exception as e:
|
||||||
|
write_callback.on_failure(record_envelope, e, failure_metadata={})
|
||||||
|
|
||||||
def emit_async(
|
def emit_async(
|
||||||
self,
|
self,
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user