Fix Metrics Buckets and Url Normalization (#22536)

* Fix Metrics Buckets and Url Normalization
* Add more URIs to normalization
* Maintain normalized endpoints
* Fix more tags
* Fix tests

Co-authored-by: Sriharsha Chintalapani <harshach@users.noreply.github.com>

This commit is contained in:
parent 727ae107f8
commit 94d297d454
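
Note on the underlying problem (not stated in the commit message): when the raw request URI is used as a Micrometer tag, every UUID, numeric ID, and query string produces its own time series, and each histogram bucket multiplies that count. Normalizing the URI to a route template before tagging keeps the series set bounded. A minimal sketch, assuming the MetricUtils class introduced below is on the classpath; the URI value is hypothetical:

import io.micrometer.core.instrument.Metrics;
import io.micrometer.core.instrument.Timer;
import java.util.concurrent.TimeUnit;
import org.openmetadata.service.monitoring.MetricUtils;

class EndpointTagSketch {
  void record(String rawUri, long elapsedNanos) {
    // Tagging with rawUri (e.g. "v1/tables/<uuid>?fields=tags") would create one timer per
    // concrete ID; tagging with the normalized template keeps one timer per route.
    Timer timer =
        Metrics.timer("request.latency.total", "endpoint", MetricUtils.normalizeUri(rawUri));
    timer.record(elapsedNanos, TimeUnit.NANOSECONDS);
  }
}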
MetricUtils.java (new file)
@@ -0,0 +1,247 @@
package org.openmetadata.service.monitoring;

import java.time.Duration;
import lombok.experimental.UtilityClass;

@UtilityClass
public class MetricUtils {
  // Standard SLA buckets for all latency metrics (10 buckets total)
  public static final Duration[] LATENCY_SLA_BUCKETS = {
    Duration.ofMillis(10), // 10ms
    Duration.ofMillis(25), // 25ms
    Duration.ofMillis(50), // 50ms
    Duration.ofMillis(100), // 100ms
    Duration.ofMillis(250), // 250ms
    Duration.ofMillis(500), // 500ms
    Duration.ofSeconds(1), // 1s
    Duration.ofMillis(2500), // 2.5s
    Duration.ofSeconds(5), // 5s
    Duration.ofSeconds(30) // 30s
  };

  public static String normalizeUri(String uri) {
    // Normalize URIs to avoid high cardinality
    if (uri == null || uri.isEmpty()) {
      return "/unknown";
    }

    // Remove query parameters to reduce cardinality
    String normalizedUri = uri.split("\\?")[0];

    // Replace various ID patterns with placeholders
    normalizedUri =
        normalizedUri
            // UUID patterns (e.g., /api/v1/tables/12345678-1234-1234-1234-123456789abc)
            .replaceAll("/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", "/{id}")
            // Numeric IDs (e.g., /api/v1/tables/123456)
            .replaceAll("/\\d+", "/{id}")
            // Entity names that contain special characters or spaces (encoded)
            .replaceAll("/[^/]*%[0-9a-fA-F]{2}[^/]*", "/{name}")
            // Long alphanumeric strings that might be encoded names
            .replaceAll("/[a-zA-Z0-9_.-]{20,}", "/{name}")
            // Handle common OpenMetadata API patterns - split into multiple patterns to reduce
            // complexity
            .replaceAll(
                "/(tables|databases|services|pipelines|topics|dashboards|charts|containers)/[^/]+/[^/]+",
                "/$1/{name}/{subresource}")
            .replaceAll(
                "/(glossaryTerms|tags|policies|roles|users|teams|dataModels|searchIndexes)/[^/]+/[^/]+",
                "/$1/{name}/{subresource}")
            .replaceAll(
                "/(testSuites|testCases|webhooks|bots|automations|applications|connections)/[^/]+/[^/]+",
                "/$1/{name}/{subresource}")
            .replaceAll(
                "/(secrets|storedProcedures|databaseSchemas|mlModels|reports|metrics)/[^/]+/[^/]+",
                "/$1/{name}/{subresource}")
            .replaceAll(
                "/(queries|suggestions|lineage|events|feeds|conversations|activities)/[^/]+/[^/]+",
                "/$1/{name}/{subresource}")
            .replaceAll(
                "/(tasks|kpis|domains|dataProducts|governanceWorkflows)/[^/]+/[^/]+",
                "/$1/{name}/{subresource}")
            .replaceAll(
                "/(tables|databases|services|pipelines|topics|dashboards|charts|containers)/[^/]+",
                "/$1/{name}")
            .replaceAll(
                "/(glossaryTerms|tags|policies|roles|users|teams|dataModels|searchIndexes)/[^/]+",
                "/$1/{name}")
            .replaceAll(
                "/(testSuites|testCases|webhooks|bots|automations|applications|connections)/[^/]+",
                "/$1/{name}")
            .replaceAll(
                "/(secrets|storedProcedures|databaseSchemas|mlModels|reports|metrics)/[^/]+",
                "/$1/{name}")
            .replaceAll(
                "/(queries|suggestions|lineage|events|feeds|conversations|activities)/[^/]+",
                "/$1/{name}")
            .replaceAll(
                "/(tasks|kpis|domains|dataProducts|governanceWorkflows)/[^/]+", "/$1/{name}")
            // Analytics deep paths with timestamps and multiple segments
            .replaceAll(
                "/analytics/dataInsights/[^/]+/[^/]+", "/analytics/dataInsights/{type}/{id}")
            .replaceAll(
                "/analytics/web/events/[^/]+/[^/]+/collect",
                "/analytics/web/events/{name}/{timestamp}/collect")
            // Data quality multi-level paths
            .replaceAll("/dataQuality/testCases/[^/]+/[^/]+", "/dataQuality/testCases/{type}/{id}")
            .replaceAll(
                "/dataQuality/testSuites/[^/]+/[^/]+", "/dataQuality/testSuites/{id}/{subresource}")
            // Complex lineage patterns with multiple entities
            .replaceAll(
                "/lineage/[^/]+/[^/]+/[^/]+/[^/]+",
                "/lineage/{fromEntity}/{fromId}/{toEntity}/{toId}")
            .replaceAll(
                "/lineage/[^/]+/name/[^/]+/[^/]+/name/[^/]+",
                "/lineage/{fromEntity}/name/{fromFQN}/{toEntity}/name/{toFQN}")
            .replaceAll(
                "/lineage/[^/]+/[^/]+/type/[^/]+",
                "/lineage/{entityType}/{entityId}/type/{lineageSource}")
            // Event subscriptions complex paths
            .replaceAll(
                "/events/subscriptions/[^/]+/[^/]+/[^/]+",
                "/events/subscriptions/{id}/{resource}/{subresource}")
            .replaceAll(
                "/events/subscriptions/name/[^/]+/[^/]+",
                "/events/subscriptions/name/{name}/{resource}")
            // Service nested paths
            .replaceAll("/services/[^/]+/[^/]+/[^/]+", "/services/{serviceType}/{id}/{subresource}")
            .replaceAll(
                "/services/testConnectionDefinitions/[^/]+",
                "/services/testConnectionDefinitions/{connectionType}")
            // Governance workflow paths
            .replaceAll(
                "/governance/[^/]+/[^/]+/[^/]+",
                "/governance/{workflowType}/{definitionName}/{instanceId}")
            // Drive/file management paths
            .replaceAll("/drives/[^/]+/[^/]+/[^/]+", "/drives/{type}/{id}/{subresource}")
            // Universal entity sub-resources (versions, followers, results, etc.)
            .replaceAll("/([^/]+)/([^/]+)/versions/[^/]+", "/$1/$2/versions/{version}")
            .replaceAll("/([^/]+)/([^/]+)/followers/[^/]+", "/$1/$2/followers/{userId}")
            .replaceAll("/([^/]+)/([^/]+)/results/[^/]+", "/$1/$2/results/{result}")
            .replaceAll(
                "/([^/]+)/([^/]+)/results/before/[^/]+", "/$1/$2/results/before/{timestamp}")
            .replaceAll(
                "/([^/]+)/name/([^/]+)/(export|import|exportAsync|importAsync)", "/$1/name/$2/$3")
            // SCIM paths
            .replaceAll("/scim/(Users|Groups)/[^/]+", "/scim/$1/{id}")
            // Permission resource patterns
            .replaceAll("/permissions/[^/]+/[^/]+", "/permissions/{resource}/{id}")
            .replaceAll("/permissions/[^/]+/name/[^/]+", "/permissions/{resource}/name/{name}")
            .replaceAll("/permissions/view/[^/]+", "/permissions/view/{entityType}")
            .replaceAll("/permissions/debug/user/[^/]+", "/permissions/debug/user/{username}")
            .replaceAll("/permissions/debug/evaluate", "/permissions/debug/evaluate")
            .replaceAll("/permissions/[^/]+", "/permissions/{resource}")
            // EventSubscription complex patterns (HIGH PRIORITY - prevents cardinality explosion)
            .replaceAll(
                "/events/subscriptions/name/[^/]+/status/[^/]+",
                "/events/subscriptions/name/{name}/status/{destinationId}")
            .replaceAll(
                "/events/subscriptions/[^/]+/status/[^/]+",
                "/events/subscriptions/{id}/status/{destinationId}")
            .replaceAll(
                "/events/subscriptions/[^/]+/resources",
                "/events/subscriptions/{alertType}/resources")
            .replaceAll(
                "/events/subscriptions/id/[^/]+/listEvents",
                "/events/subscriptions/id/{id}/listEvents")
            .replaceAll(
                "/events/subscriptions/id/[^/]+/eventsRecord",
                "/events/subscriptions/id/{subscriptionId}/eventsRecord")
            .replaceAll(
                "/events/subscriptions/name/[^/]+/eventsRecord",
                "/events/subscriptions/name/{subscriptionName}/eventsRecord")
            .replaceAll(
                "/events/subscriptions/id/[^/]+/diagnosticInfo",
                "/events/subscriptions/id/{subscriptionId}/diagnosticInfo")
            .replaceAll(
                "/events/subscriptions/name/[^/]+/diagnosticInfo",
                "/events/subscriptions/name/{subscriptionName}/diagnosticInfo")
            .replaceAll(
                "/events/subscriptions/id/[^/]+/failedEvents",
                "/events/subscriptions/id/{id}/failedEvents")
            .replaceAll(
                "/events/subscriptions/name/[^/]+/failedEvents",
                "/events/subscriptions/name/{eventSubscriptionName}/failedEvents")
            .replaceAll(
                "/events/subscriptions/id/[^/]+/listSuccessfullySentChangeEvents",
                "/events/subscriptions/id/{id}/listSuccessfullySentChangeEvents")
            .replaceAll(
                "/events/subscriptions/name/[^/]+/listSuccessfullySentChangeEvents",
                "/events/subscriptions/name/{eventSubscriptionName}/listSuccessfullySentChangeEvents")
            .replaceAll(
                "/events/subscriptions/id/[^/]+/destinations",
                "/events/subscriptions/id/{eventSubscriptionId}/destinations")
            .replaceAll(
                "/events/subscriptions/name/[^/]+/destinations",
                "/events/subscriptions/name/{eventSubscriptionName}/destinations")
            .replaceAll(
                "/events/subscriptions/name/[^/]+/syncOffset",
                "/events/subscriptions/name/{eventSubscriptionName}/syncOffset")
            // App management patterns
            .replaceAll("/apps/name/[^/]+/status", "/apps/name/{name}/status")
            .replaceAll("/apps/name/[^/]+/extension", "/apps/name/{name}/extension")
            .replaceAll("/apps/name/[^/]+/logs", "/apps/name/{name}/logs")
            .replaceAll("/apps/name/[^/]+/runs/latest", "/apps/name/{name}/runs/latest")
            .replaceAll("/apps/schedule/[^/]+", "/apps/schedule/{name}")
            .replaceAll("/apps/configure/[^/]+", "/apps/configure/{name}")
            .replaceAll("/apps/trigger/[^/]+", "/apps/trigger/{name}")
            .replaceAll("/apps/stop/[^/]+", "/apps/stop/{name}")
            .replaceAll("/apps/deploy/[^/]+", "/apps/deploy/{name}")
            // IngestionPipeline operational patterns
            .replaceAll(
                "/services/ingestionPipelines/deploy/[^/]+",
                "/services/ingestionPipelines/deploy/{id}")
            .replaceAll(
                "/services/ingestionPipelines/trigger/[^/]+",
                "/services/ingestionPipelines/trigger/{id}")
            .replaceAll(
                "/services/ingestionPipelines/toggleIngestion/[^/]+",
                "/services/ingestionPipelines/toggleIngestion/{id}")
            .replaceAll(
                "/services/ingestionPipelines/kill/[^/]+", "/services/ingestionPipelines/kill/{id}")
            .replaceAll(
                "/services/ingestionPipelines/logs/[^/]+/last",
                "/services/ingestionPipelines/logs/{id}/last")
            .replaceAll(
                "/services/ingestionPipelines/[^/]+/pipelineStatus/[^/]+",
                "/services/ingestionPipelines/{fqn}/pipelineStatus/{id}")
            .replaceAll(
                "/services/ingestionPipelines/[^/]+/pipelineStatus",
                "/services/ingestionPipelines/{fqn}/pipelineStatus")
            // Search resource patterns
            .replaceAll("/search/get/[^/]+/doc/[^/]+", "/search/get/{index}/doc/{id}")
            // User authentication & security patterns
            .replaceAll("/users/generateToken/[^/]+", "/users/generateToken/{id}")
            .replaceAll("/users/token/[^/]+", "/users/token/{id}")
            .replaceAll("/users/auth-mechanism/[^/]+", "/users/auth-mechanism/{id}")
            // Feed & discussion patterns
            .replaceAll("/feed/tasks/[^/]+/resolve", "/feed/tasks/{id}/resolve")
            .replaceAll("/feed/tasks/[^/]+/close", "/feed/tasks/{id}/close")
            .replaceAll("/feed/tasks/[^/]+", "/feed/tasks/{id}")
            .replaceAll("/feed/[^/]+/posts/[^/]+", "/feed/{threadId}/posts/{postId}")
            .replaceAll("/feed/[^/]+/posts", "/feed/{id}/posts")
            .replaceAll("/feed/[^/]+", "/feed/{threadId}")
            // System & configuration patterns
            .replaceAll("/system/settings/[^/]+", "/system/settings/{name}")
            .replaceAll("/system/settings/reset/[^/]+", "/system/settings/reset/{name}")
            // DocStore patterns
            .replaceAll(
                "/docStore/validateTemplate/[^/]+", "/docStore/validateTemplate/{templateName}")
            // Handle remaining timestamp patterns
            .replaceAll("/[0-9]{10,13}", "/{timestamp}");

    // Ensure we don't have empty path segments
    normalizedUri = normalizedUri.replaceAll("/+", "/");

    // Limit to reasonable URI length to prevent edge cases
    if (normalizedUri.length() > 100) {
      // For very long URIs, just use the first few path segments
      String[] segments = normalizedUri.split("/");
      if (segments.length > 5) {
        normalizedUri = String.join("/", java.util.Arrays.copyOfRange(segments, 0, 5)) + "/...";
      }
    }

    return normalizedUri.isEmpty() ? "/" : normalizedUri;
  }
}
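
To make the rule chain above concrete, a few traced examples. The inputs are invented; note that the generic "/<entity>/<segment>" rules run after the ID rules, so for entity routes a UUID first collapses to {id} and is then rewritten to {name}:

import org.openmetadata.service.monitoring.MetricUtils;

class NormalizeUriExamples {
  public static void main(String[] args) {
    // Expected values were traced by hand through the regex chain above; illustrative only.
    System.out.println(MetricUtils.normalizeUri(null)); // -> /unknown (null/empty guard)
    System.out.println(MetricUtils.normalizeUri("/v1/users/123456?fields=roles"));
    // -> /v1/users/{name}  (query string stripped, /123456 -> /{id}, then /users/<seg> -> {name})
    System.out.println(MetricUtils.normalizeUri("v1/tables/0a1b2c3d-1111-2222-3333-444455556666"));
    // -> v1/tables/{name}  (UUID -> {id}, then the generic /tables/<seg> rule -> {name})
  }
}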
MicrometerBundle.java
@@ -138,8 +138,7 @@ public class MicrometerBundle implements ConfiguredBundle<OpenMetadataApplicatio
    // HTTP request histogram
    io.micrometer.core.instrument.Timer.builder("http_server_requests_sec")
        .description("HTTP methods duration")
        .publishPercentileHistogram()
        .serviceLevelObjectives(
        .sla(
            java.time.Duration.ofMillis(10),
            java.time.Duration.ofMillis(100),
            java.time.Duration.ofSeconds(1),
@@ -153,8 +152,7 @@ public class MicrometerBundle implements ConfiguredBundle<OpenMetadataApplicatio
    // JDBI request histogram
    io.micrometer.core.instrument.Timer.builder("jdbi_requests_seconds")
        .description("jdbi requests duration distribution")
        .publishPercentileHistogram()
        .serviceLevelObjectives(
        .sla(
            java.time.Duration.ofMillis(10),
            java.time.Duration.ofMillis(100),
            java.time.Duration.ofSeconds(1),
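
For context on the bucket change in this bundle: each SLO boundary adds exactly one cumulative histogram bucket to a timer, whereas publishPercentileHistogram() adds a much larger auto-generated bucket set, so dropping it in favour of a fixed SLA list keeps the per-endpoint series count predictable. A hedged sketch of a timer built against the shared boundaries (the registry wiring and class name here are illustrative, not the bundle's actual code):

import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.Timer;
import io.micrometer.core.instrument.simple.SimpleMeterRegistry;
import org.openmetadata.service.monitoring.MetricUtils;

class SlaBucketSketch {
  Timer httpTimer(MeterRegistry registry) {
    // Ten fixed boundaries (10 ms ... 30 s) -> at most ten *_bucket series per tag set.
    // .sla(Duration...) is the older Micrometer spelling of serviceLevelObjectives(Duration...).
    return Timer.builder("http_server_requests_sec")
        .description("HTTP methods duration")
        .sla(MetricUtils.LATENCY_SLA_BUCKETS)
        .register(registry);
  }

  Timer httpTimer() {
    return httpTimer(new SimpleMeterRegistry());
  }
}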
OpenMetadataMetrics.java
@@ -1,5 +1,8 @@
package org.openmetadata.service.monitoring;

import static org.openmetadata.service.monitoring.MetricUtils.LATENCY_SLA_BUCKETS;
import static org.openmetadata.service.monitoring.MetricUtils.normalizeUri;

import io.micrometer.core.instrument.Counter;
import io.micrometer.core.instrument.DistributionSummary;
import io.micrometer.core.instrument.MeterRegistry;
@@ -51,12 +54,7 @@ public class OpenMetadataMetrics {
    this.httpRequestTimer =
        Timer.builder("http.server.requests")
            .description("HTTP server request duration")
            .publishPercentileHistogram()
            .serviceLevelObjectives(
                Duration.ofMillis(50),
                Duration.ofMillis(100),
                Duration.ofMillis(500),
                Duration.ofSeconds(1))
            .sla(LATENCY_SLA_BUCKETS)
            .register(meterRegistry);

    this.httpRequestCounter =
@@ -68,16 +66,14 @@ public class OpenMetadataMetrics {
        DistributionSummary.builder("http.server.response.size")
            .description("HTTP response size in bytes")
            .baseUnit("bytes")
            .publishPercentileHistogram()
            .serviceLevelObjectives(1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216)
            .register(meterRegistry);

    // Initialize Database metrics
    this.jdbiQueryTimer =
        Timer.builder("db.query.duration")
            .description("Database query duration")
            .publishPercentileHistogram()
            .serviceLevelObjectives(
                Duration.ofMillis(10), Duration.ofMillis(50), Duration.ofMillis(100))
            .sla(LATENCY_SLA_BUCKETS)
            .register(meterRegistry);

    this.jdbiConnectionCounter =
@@ -123,7 +119,7 @@ public class OpenMetadataMetrics {
    this.pipelineExecutionTimer =
        Timer.builder("pipeline.execution.duration")
            .description("Pipeline execution duration")
            .publishPercentileHistogram()
            .sla(LATENCY_SLA_BUCKETS)
            .register(meterRegistry);

    // Initialize Authentication metrics
@@ -265,14 +261,6 @@ public class OpenMetadataMetrics {
        .register(meterRegistry);
  }

  // Utility methods
  private String normalizeUri(String uri) {
    // Normalize URIs to avoid high cardinality
    // Replace IDs with placeholders
    return uri.replaceAll("/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", "/{id}")
        .replaceAll("/[0-9]+", "/{id}");
  }

  private String getStatusClass(int status) {
    if (status >= 100 && status < 200) return "1xx";
    if (status >= 200 && status < 300) return "2xx";
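
Two smaller details in this class follow the same cardinality logic: response sizes are bucketed on explicit byte boundaries that grow by 4x from 1 KiB to 16 MiB, and HTTP status codes are collapsed to their class before being used as tags, while the duplicated private normalizeUri is dropped in favour of the shared MetricUtils.normalizeUri. A short sketch; the field name and the 3xx-5xx branches are assumptions, since the hunk above is cut off after "2xx":

import io.micrometer.core.instrument.DistributionSummary;
import io.micrometer.core.instrument.simple.SimpleMeterRegistry;

class ResponseSizeSketch {
  // 1 KiB, 4 KiB, 16 KiB, 64 KiB, 256 KiB, 1 MiB, 4 MiB, 16 MiB
  final DistributionSummary responseSize =
      DistributionSummary.builder("http.server.response.size")
          .description("HTTP response size in bytes")
          .baseUnit("bytes")
          .serviceLevelObjectives(1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216)
          .register(new SimpleMeterRegistry());

  // Collapse a concrete status (204, 404, ...) into a bounded tag value.
  static String statusClass(int status) {
    if (status >= 100 && status < 200) return "1xx";
    if (status >= 200 && status < 300) return "2xx";
    if (status >= 300 && status < 400) return "3xx"; // assumed continuation of the truncated hunk
    if (status >= 400 && status < 500) return "4xx"; // assumed continuation
    return "5xx"; // assumed fallback
  }
}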
RequestLatencyContext.java
@@ -1,5 +1,8 @@
package org.openmetadata.service.monitoring;

import static org.openmetadata.service.monitoring.MetricUtils.LATENCY_SLA_BUCKETS;
import static org.openmetadata.service.monitoring.MetricUtils.normalizeUri;

import io.micrometer.core.instrument.Gauge;
import io.micrometer.core.instrument.Metrics;
import io.micrometer.core.instrument.Timer;
@@ -48,14 +51,14 @@ public class RequestLatencyContext {
  public static void startRequest(String endpoint) {
    RequestContext context = new RequestContext(endpoint);
    requestContext.set(context);

    String normalizedEndpoint = normalizeUri(endpoint);
    requestTimers.computeIfAbsent(
        endpoint,
        normalizedEndpoint,
        k ->
            Timer.builder("request.latency.total")
                .tag(ENDPOINT, endpoint)
                .tag(ENDPOINT, normalizedEndpoint)
                .description("Total request latency")
                .publishPercentileHistogram()
                .sla(LATENCY_SLA_BUCKETS)
                .register(Metrics.globalRegistry));
    context.requestTimerSample = Timer.start(Metrics.globalRegistry);
    context.internalTimerStartNanos = System.nanoTime();
@@ -128,10 +131,11 @@ public class RequestLatencyContext {
    RequestContext context = requestContext.get();
    if (context == null) return;

    String normalizedEndpoint = normalizeUri(context.endpoint);
    try {
      // Stop request timer
      if (context.requestTimerSample != null) {
        Timer requestTimer = requestTimers.get(context.endpoint);
        Timer requestTimer = requestTimers.get(normalizedEndpoint);
        if (requestTimer != null) {
          context.totalTime = context.requestTimerSample.stop(requestTimer);
        }
@@ -145,47 +149,47 @@ public class RequestLatencyContext {
      // This gives us the total DB time for THIS request
      Timer dbTimer =
          databaseTimers.computeIfAbsent(
              context.endpoint,
              normalizedEndpoint,
              k ->
                  Timer.builder("request.latency.database")
                      .tag(ENDPOINT, context.endpoint)
                      .tag(ENDPOINT, normalizedEndpoint)
                      .description("Total database latency per request")
                      .publishPercentileHistogram()
                      .sla(LATENCY_SLA_BUCKETS)
                      .register(Metrics.globalRegistry));
      dbTimer.record(context.dbTime, java.util.concurrent.TimeUnit.NANOSECONDS);

      // Record total search time for THIS request
      Timer searchTimer =
          searchTimers.computeIfAbsent(
              context.endpoint,
              normalizedEndpoint,
              k ->
                  Timer.builder("request.latency.search")
                      .tag(ENDPOINT, context.endpoint)
                      .tag(ENDPOINT, normalizedEndpoint)
                      .description("Total search latency per request")
                      .publishPercentileHistogram()
                      .sla(LATENCY_SLA_BUCKETS)
                      .register(Metrics.globalRegistry));
      searchTimer.record(context.searchTime, java.util.concurrent.TimeUnit.NANOSECONDS);

      // Record internal processing time for THIS request
      Timer internalTimer =
          internalTimers.computeIfAbsent(
              context.endpoint,
              normalizedEndpoint,
              k ->
                  Timer.builder("request.latency.internal")
                      .tag(ENDPOINT, context.endpoint)
                      .tag(ENDPOINT, normalizedEndpoint)
                      .description("Internal processing latency per request")
                      .publishPercentileHistogram()
                      .sla(LATENCY_SLA_BUCKETS)
                      .register(Metrics.globalRegistry));
      internalTimer.record(context.internalTime, java.util.concurrent.TimeUnit.NANOSECONDS);

      // Record operation counts as distribution summaries to get avg/max/percentiles
      if (context.dbOperationCount > 0) {
        Metrics.summary("request.operations.database", ENDPOINT, context.endpoint)
        Metrics.summary("request.operations.database", ENDPOINT, normalizedEndpoint)
            .record(context.dbOperationCount);
      }

      if (context.searchOperationCount > 0) {
        Metrics.summary("request.operations.search", ENDPOINT, context.endpoint)
        Metrics.summary("request.operations.search", ENDPOINT, normalizedEndpoint)
            .record(context.searchOperationCount);
      }

@@ -198,23 +202,23 @@ public class RequestLatencyContext {
      // Get or create percentage holder for this endpoint
      PercentageHolder holder =
          percentageHolders.computeIfAbsent(
              context.endpoint,
              normalizedEndpoint,
              k -> {
                PercentageHolder newHolder = new PercentageHolder();

                // Register gauges that read from the atomic references
                Gauge.builder("request.percentage.database", newHolder.databasePercent::get)
                    .tag(ENDPOINT, context.endpoint)
                    .tag(ENDPOINT, normalizedEndpoint)
                    .description("Percentage of request time spent in database operations")
                    .register(Metrics.globalRegistry);

                Gauge.builder("request.percentage.search", newHolder.searchPercent::get)
                    .tag(ENDPOINT, context.endpoint)
                    .tag(ENDPOINT, normalizedEndpoint)
                    .description("Percentage of request time spent in search operations")
                    .register(Metrics.globalRegistry);

                Gauge.builder("request.percentage.internal", newHolder.internalPercent::get)
                    .tag(ENDPOINT, context.endpoint)
                    .tag(ENDPOINT, normalizedEndpoint)
                    .description("Percentage of request time spent in internal processing")
                    .register(Metrics.globalRegistry);
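
Putting the pieces together: a request is wrapped by startRequest/endRequest, and every per-request timer, summary, and gauge is now keyed and tagged by the normalized endpoint, which is also how the tests below look the meters up. A minimal usage sketch, with a hypothetical raw endpoint:

import io.micrometer.core.instrument.Metrics;
import io.micrometer.core.instrument.Timer;
import org.openmetadata.service.monitoring.MetricUtils;
import org.openmetadata.service.monitoring.RequestLatencyContext;

class LatencyTrackingSketch {
  long handleOnce() {
    String endpoint = "v1/tables/0a1b2c3d-1111-2222-3333-444455556666"; // hypothetical raw path
    RequestLatencyContext.startRequest(endpoint);
    try {
      // ... serve the request; DB and search phases are accumulated by the context ...
    } finally {
      RequestLatencyContext.endRequest();
    }
    // The timer is registered under the normalized endpoint, not the raw one.
    Timer total =
        Metrics.timer("request.latency.total", "endpoint", MetricUtils.normalizeUri(endpoint));
    return total.count();
  }
}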
RequestLatencyContextTest.java
@@ -39,10 +39,11 @@ class RequestLatencyContextTest {

    simulateWork(20);

    String normalizedEndpoint = MetricUtils.normalizeUri(endpoint);
    RequestLatencyContext.endRequest();
    Timer totalTimer = Metrics.timer("request.latency.total", "endpoint", endpoint);
    Timer dbTimer = Metrics.timer("request.latency.database", "endpoint", endpoint);
    Timer internalTimer = Metrics.timer("request.latency.internal", "endpoint", endpoint);
    Timer totalTimer = Metrics.timer("request.latency.total", "endpoint", normalizedEndpoint);
    Timer dbTimer = Metrics.timer("request.latency.database", "endpoint", normalizedEndpoint);
    Timer internalTimer = Metrics.timer("request.latency.internal", "endpoint", normalizedEndpoint);

    assertEquals(1, totalTimer.count(), "Should have recorded 1 request");
    assertEquals(1, dbTimer.count(), "Should have recorded 1 request with database operations");
RequestLatencyTrackingSimpleTest.java
@@ -34,15 +34,16 @@ class RequestLatencyTrackingSimpleTest {
    simulateWork(30);
    RequestLatencyContext.endRequest();

    Timer totalTimer = Metrics.timer("request.latency.total", "endpoint", endpoint);
    String normalizedEndpoint = MetricUtils.normalizeUri(endpoint);
    Timer totalTimer = Metrics.timer("request.latency.total", "endpoint", normalizedEndpoint);
    assertNotNull(totalTimer);
    assertEquals(1, totalTimer.count(), "Should have recorded 1 request");

    Timer dbTimer = Metrics.timer("request.latency.database", "endpoint", endpoint);
    Timer dbTimer = Metrics.timer("request.latency.database", "endpoint", normalizedEndpoint);
    assertNotNull(dbTimer);
    assertEquals(1, dbTimer.count(), "Should have recorded 1 database operation");

    Timer internalTimer = Metrics.timer("request.latency.internal", "endpoint", endpoint);
    Timer internalTimer = Metrics.timer("request.latency.internal", "endpoint", normalizedEndpoint);
    assertNotNull(internalTimer);
    assertEquals(1, internalTimer.count(), "Should have recorded internal processing");

@@ -54,9 +55,14 @@ class RequestLatencyTrackingSimpleTest {
    LOG.info("Database time: {} ms", dbMs);
    LOG.info("Internal time: {} ms", internalMs);

    assertTrue(totalMs >= 150 && totalMs <= 210, "Total time should be ~180ms, got: " + totalMs);
    assertTrue(dbMs >= 80 && dbMs <= 120, "Database time should be ~100ms, got: " + dbMs);
    // Timing expectations: 500ms + 100ms + 30ms = 630ms total
    // DB time: 100ms during database operation
    // Internal time: 500ms (before DB) + 30ms (after DB) = 530ms
    // Allow generous bounds for system timing variations
    assertTrue(totalMs >= 500 && totalMs <= 1000, "Total time should be ~630ms, got: " + totalMs);
    assertTrue(dbMs >= 80 && dbMs <= 150, "Database time should be ~100ms, got: " + dbMs);
    assertTrue(
        internalMs >= 60 && internalMs <= 100, "Internal time should be ~80ms, got: " + internalMs);
        internalMs >= 400 && internalMs <= 700,
        "Internal time should be ~530ms, got: " + internalMs);
  }
}
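
The widened bounds follow from the simulated work the test now performs: roughly 500 ms of internal work before the database phase, about 100 ms inside it, and about 30 ms afterwards, with headroom for scheduler jitter. The arithmetic behind the assertions, as a sketch (the actual sleep values live in the test):

class TimingExpectationSketch {
  static final long BEFORE_DB_MS = 500;
  static final long DB_MS = 100;
  static final long AFTER_DB_MS = 30;
  static final long EXPECTED_TOTAL_MS = BEFORE_DB_MS + DB_MS + AFTER_DB_MS; // ~630, asserted as 500..1000
  static final long EXPECTED_INTERNAL_MS = BEFORE_DB_MS + AFTER_DB_MS;      // ~530, asserted as 400..700
}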
RequestLatencyTrackingTest.java
@@ -123,10 +123,11 @@ class RequestLatencyTrackingTest extends OpenMetadataApplicationTest {
    // Check for specific endpoint metrics - the actual metrics use the endpoint path as-is
    LOG.info("Looking for metrics with endpoint: {}", getEndpoint);

    String normalizedUri = MetricUtils.normalizeUri("v1/tables/" + createdTable.getId());
    // Parse and verify latency metrics
    assertLatencyMetricsExist(prometheusMetrics, "request_latency_total", getEndpoint);
    assertLatencyMetricsExist(prometheusMetrics, "request_latency_database", getEndpoint);
    assertLatencyMetricsExist(prometheusMetrics, "request_latency_internal", getEndpoint);
    assertLatencyMetricsExist(prometheusMetrics, "request_latency_total", normalizedUri);
    assertLatencyMetricsExist(prometheusMetrics, "request_latency_database", normalizedUri);
    assertLatencyMetricsExist(prometheusMetrics, "request_latency_internal", normalizedUri);
  }

  @Test
@@ -205,7 +206,7 @@ class RequestLatencyTrackingTest extends OpenMetadataApplicationTest {

    WebTarget complexTarget =
        getResource("tables/" + createdTable.getId())
            .queryParam("fields", "owners,tags,followers,columns,domain,dataProducts,extension")
            .queryParam("fields", "owners,tags,followers,columns,domains,dataProducts,extension")
            .queryParam("include", "all");
    TestUtils.get(complexTarget, Table.class, ADMIN_AUTH_HEADERS);

@@ -218,7 +219,7 @@ class RequestLatencyTrackingTest extends OpenMetadataApplicationTest {
    String endpoint = "v1/tables/" + createdTable.getId();

    // Verify database operation count
    assertLatencyMetricsExist(prometheusMetrics, "request_latency_database", endpoint);
    assertLatencyMetricsExist(
        prometheusMetrics, "request_latency_database", MetricUtils.normalizeUri(endpoint));

    // Check that we have multiple database operations recorded
    assertTrue(
@@ -247,7 +249,9 @@ class RequestLatencyTrackingTest extends OpenMetadataApplicationTest {
  private void assertLatencyMetricsExist(
      String prometheusOutput, String metricName, String endpoint) {
    // Look for metrics that contain the metric name with the endpoint label
    String pattern = metricName + "_seconds.*endpoint=\"" + endpoint.replace("/", "\\/") + "\"";
    // Escape regex special characters in the endpoint string
    String escapedEndpoint = java.util.regex.Pattern.quote(endpoint);
    String pattern = metricName + "_seconds.*endpoint=\"" + escapedEndpoint + "\"";
    assertTrue(
        prometheusOutput.matches("(?s).*" + pattern + ".*"),
        String.format(
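
The switch to Pattern.quote in the helper above matters because normalized endpoints now contain '{' and '}', which Java's regex engine treats as the start of a quantifier and typically rejects as an illegal repetition; quoting makes the whole endpoint a literal. A small sketch (names are illustrative, not the test's exact helpers):

import java.util.regex.Pattern;

class EndpointPatternSketch {
  // Builds the same kind of label-matching pattern as the test helper.
  static String bucketPattern(String metricName, String endpoint) {
    String escapedEndpoint = Pattern.quote(endpoint); // wraps the value in \Q...\E
    return metricName + "_seconds.*endpoint=\"" + escapedEndpoint + "\"";
  }

  public static void main(String[] args) {
    // Safe even though the normalized endpoint contains regex metacharacters like {name}.
    String pattern = bucketPattern("request_latency_total", "v1/tables/{name}");
    System.out.println("sample text".matches("(?s).*" + pattern + ".*"));
  }
}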