mirror of
				https://github.com/datahub-project/datahub.git
				synced 2025-11-04 04:39:10 +00:00 
			
		
		
		
	feat(ingest): measure sink bottlenecking (#10628)
This commit is contained in:
		
							parent
							
								
									8f40229da2
								
							
						
					
					
						commit
						fcab544f17
					
				@ -1,9 +1,9 @@
 | 
				
			|||||||
import concurrent.futures
 | 
					import concurrent.futures
 | 
				
			||||||
import contextlib
 | 
					import contextlib
 | 
				
			||||||
 | 
					import dataclasses
 | 
				
			||||||
import functools
 | 
					import functools
 | 
				
			||||||
import logging
 | 
					import logging
 | 
				
			||||||
import uuid
 | 
					import uuid
 | 
				
			||||||
from dataclasses import dataclass
 | 
					 | 
				
			||||||
from enum import auto
 | 
					from enum import auto
 | 
				
			||||||
from typing import Optional, Union
 | 
					from typing import Optional, Union
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -29,6 +29,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.mxe import (
 | 
				
			|||||||
    MetadataChangeProposal,
 | 
					    MetadataChangeProposal,
 | 
				
			||||||
)
 | 
					)
 | 
				
			||||||
from datahub.utilities.advanced_thread_executor import PartitionExecutor
 | 
					from datahub.utilities.advanced_thread_executor import PartitionExecutor
 | 
				
			||||||
 | 
					from datahub.utilities.perf_timer import PerfTimer
 | 
				
			||||||
from datahub.utilities.server_config_util import set_gms_config
 | 
					from datahub.utilities.server_config_util import set_gms_config
 | 
				
			||||||
 | 
					
 | 
				
			||||||
logger = logging.getLogger(__name__)
 | 
					logger = logging.getLogger(__name__)
 | 
				
			||||||
@ -44,15 +45,17 @@ class DatahubRestSinkConfig(DatahubClientConfig):
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    # These only apply in async mode.
 | 
					    # These only apply in async mode.
 | 
				
			||||||
    max_threads: int = 15
 | 
					    max_threads: int = 15
 | 
				
			||||||
    max_pending_requests: int = 500
 | 
					    max_pending_requests: int = 2000
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@dataclass
 | 
					@dataclasses.dataclass
 | 
				
			||||||
class DataHubRestSinkReport(SinkReport):
 | 
					class DataHubRestSinkReport(SinkReport):
 | 
				
			||||||
    max_threads: int = -1
 | 
					    max_threads: Optional[int] = None
 | 
				
			||||||
    gms_version: str = ""
 | 
					    gms_version: Optional[str] = None
 | 
				
			||||||
    pending_requests: int = 0
 | 
					    pending_requests: int = 0
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    main_thread_blocking_timer: PerfTimer = dataclasses.field(default_factory=PerfTimer)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def compute_stats(self) -> None:
 | 
					    def compute_stats(self) -> None:
 | 
				
			||||||
        super().compute_stats()
 | 
					        super().compute_stats()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@ -105,7 +108,7 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
 | 
				
			|||||||
        self.report.gms_version = (
 | 
					        self.report.gms_version = (
 | 
				
			||||||
            gms_config.get("versions", {})
 | 
					            gms_config.get("versions", {})
 | 
				
			||||||
            .get("acryldata/datahub", {})
 | 
					            .get("acryldata/datahub", {})
 | 
				
			||||||
            .get("version", "")
 | 
					            .get("version", None)
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
        self.report.max_threads = self.config.max_threads
 | 
					        self.report.max_threads = self.config.max_threads
 | 
				
			||||||
        logger.debug("Setting env variables to override config")
 | 
					        logger.debug("Setting env variables to override config")
 | 
				
			||||||
@ -189,25 +192,28 @@ class DatahubRestSink(Sink[DatahubRestSinkConfig, DataHubRestSinkReport]):
 | 
				
			|||||||
        ],
 | 
					        ],
 | 
				
			||||||
        write_callback: WriteCallback,
 | 
					        write_callback: WriteCallback,
 | 
				
			||||||
    ) -> None:
 | 
					    ) -> None:
 | 
				
			||||||
        record = record_envelope.record
 | 
					        # Because the default is async mode and most sources are slower than the sink, this
 | 
				
			||||||
        if self.config.mode == SyncOrAsync.ASYNC:
 | 
					        # should only have a high value if the sink is actually a bottleneck.
 | 
				
			||||||
            partition_key = _get_partition_key(record_envelope)
 | 
					        with self.report.main_thread_blocking_timer:
 | 
				
			||||||
            self.executor.submit(
 | 
					            record = record_envelope.record
 | 
				
			||||||
                partition_key,
 | 
					            if self.config.mode == SyncOrAsync.ASYNC:
 | 
				
			||||||
                self._emit_wrapper,
 | 
					                partition_key = _get_partition_key(record_envelope)
 | 
				
			||||||
                record,
 | 
					                self.executor.submit(
 | 
				
			||||||
                done_callback=functools.partial(
 | 
					                    partition_key,
 | 
				
			||||||
                    self._write_done_callback, record_envelope, write_callback
 | 
					                    self._emit_wrapper,
 | 
				
			||||||
                ),
 | 
					                    record,
 | 
				
			||||||
            )
 | 
					                    done_callback=functools.partial(
 | 
				
			||||||
            self.report.pending_requests += 1
 | 
					                        self._write_done_callback, record_envelope, write_callback
 | 
				
			||||||
        else:
 | 
					                    ),
 | 
				
			||||||
            # execute synchronously
 | 
					                )
 | 
				
			||||||
            try:
 | 
					                self.report.pending_requests += 1
 | 
				
			||||||
                self._emit_wrapper(record)
 | 
					            else:
 | 
				
			||||||
                write_callback.on_success(record_envelope, success_metadata={})
 | 
					                # execute synchronously
 | 
				
			||||||
            except Exception as e:
 | 
					                try:
 | 
				
			||||||
                write_callback.on_failure(record_envelope, e, failure_metadata={})
 | 
					                    self._emit_wrapper(record)
 | 
				
			||||||
 | 
					                    write_callback.on_success(record_envelope, success_metadata={})
 | 
				
			||||||
 | 
					                except Exception as e:
 | 
				
			||||||
 | 
					                    write_callback.on_failure(record_envelope, e, failure_metadata={})
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def emit_async(
 | 
					    def emit_async(
 | 
				
			||||||
        self,
 | 
					        self,
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user