mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-12-05 12:07:16 +00:00
Fix Metrics Buckets and Url Normalization (#22536)
* Fix Metrics Buckets and Url Normalization * Add more uri to normalization * Maintain Normalized Endpoints * Fix more tags * Fix Tests --------- Co-authored-by: Sriharsha Chintalapani <harshach@users.noreply.github.com>
This commit is contained in:
parent
727ae107f8
commit
94d297d454
@ -0,0 +1,247 @@
|
|||||||
|
package org.openmetadata.service.monitoring;
|
||||||
|
|
||||||
|
import java.time.Duration;
|
||||||
|
import lombok.experimental.UtilityClass;
|
||||||
|
|
||||||
|
@UtilityClass
|
||||||
|
public class MetricUtils {
|
||||||
|
// Standard SLA buckets for all latency metrics (10 buckets total)
|
||||||
|
public static final Duration[] LATENCY_SLA_BUCKETS = {
|
||||||
|
Duration.ofMillis(10), // 10ms
|
||||||
|
Duration.ofMillis(25), // 25ms
|
||||||
|
Duration.ofMillis(50), // 50ms
|
||||||
|
Duration.ofMillis(100), // 100ms
|
||||||
|
Duration.ofMillis(250), // 250ms
|
||||||
|
Duration.ofMillis(500), // 500ms
|
||||||
|
Duration.ofSeconds(1), // 1s
|
||||||
|
Duration.ofMillis(2500), // 2.5s
|
||||||
|
Duration.ofSeconds(5), // 5s
|
||||||
|
Duration.ofSeconds(30) // 30s
|
||||||
|
};
|
||||||
|
|
||||||
|
public static String normalizeUri(String uri) {
|
||||||
|
// Normalize URIs to avoid high cardinality
|
||||||
|
if (uri == null || uri.isEmpty()) {
|
||||||
|
return "/unknown";
|
||||||
|
}
|
||||||
|
|
||||||
|
// Remove query parameters to reduce cardinality
|
||||||
|
String normalizedUri = uri.split("\\?")[0];
|
||||||
|
|
||||||
|
// Replace various ID patterns with placeholders
|
||||||
|
normalizedUri =
|
||||||
|
normalizedUri
|
||||||
|
// UUID patterns (e.g., /api/v1/tables/12345678-1234-1234-1234-123456789abc)
|
||||||
|
.replaceAll("/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", "/{id}")
|
||||||
|
// Numeric IDs (e.g., /api/v1/tables/123456)
|
||||||
|
.replaceAll("/\\d+", "/{id}")
|
||||||
|
// Entity names that contain special characters or spaces (encoded)
|
||||||
|
.replaceAll("/[^/]*%[0-9a-fA-F]{2}[^/]*", "/{name}")
|
||||||
|
// Long alphanumeric strings that might be encoded names
|
||||||
|
.replaceAll("/[a-zA-Z0-9_.-]{20,}", "/{name}")
|
||||||
|
// Handle common OpenMetadata API patterns - split into multiple patterns to reduce
|
||||||
|
// complexity
|
||||||
|
.replaceAll(
|
||||||
|
"/(tables|databases|services|pipelines|topics|dashboards|charts|containers)/[^/]+/[^/]+",
|
||||||
|
"/$1/{name}/{subresource}")
|
||||||
|
.replaceAll(
|
||||||
|
"/(glossaryTerms|tags|policies|roles|users|teams|dataModels|searchIndexes)/[^/]+/[^/]+",
|
||||||
|
"/$1/{name}/{subresource}")
|
||||||
|
.replaceAll(
|
||||||
|
"/(testSuites|testCases|webhooks|bots|automations|applications|connections)/[^/]+/[^/]+",
|
||||||
|
"/$1/{name}/{subresource}")
|
||||||
|
.replaceAll(
|
||||||
|
"/(secrets|storedProcedures|databaseSchemas|mlModels|reports|metrics)/[^/]+/[^/]+",
|
||||||
|
"/$1/{name}/{subresource}")
|
||||||
|
.replaceAll(
|
||||||
|
"/(queries|suggestions|lineage|events|feeds|conversations|activities)/[^/]+/[^/]+",
|
||||||
|
"/$1/{name}/{subresource}")
|
||||||
|
.replaceAll(
|
||||||
|
"/(tasks|kpis|domains|dataProducts|governanceWorkflows)/[^/]+/[^/]+",
|
||||||
|
"/$1/{name}/{subresource}")
|
||||||
|
.replaceAll(
|
||||||
|
"/(tables|databases|services|pipelines|topics|dashboards|charts|containers)/[^/]+",
|
||||||
|
"/$1/{name}")
|
||||||
|
.replaceAll(
|
||||||
|
"/(glossaryTerms|tags|policies|roles|users|teams|dataModels|searchIndexes)/[^/]+",
|
||||||
|
"/$1/{name}")
|
||||||
|
.replaceAll(
|
||||||
|
"/(testSuites|testCases|webhooks|bots|automations|applications|connections)/[^/]+",
|
||||||
|
"/$1/{name}")
|
||||||
|
.replaceAll(
|
||||||
|
"/(secrets|storedProcedures|databaseSchemas|mlModels|reports|metrics)/[^/]+",
|
||||||
|
"/$1/{name}")
|
||||||
|
.replaceAll(
|
||||||
|
"/(queries|suggestions|lineage|events|feeds|conversations|activities)/[^/]+",
|
||||||
|
"/$1/{name}")
|
||||||
|
.replaceAll(
|
||||||
|
"/(tasks|kpis|domains|dataProducts|governanceWorkflows)/[^/]+", "/$1/{name}")
|
||||||
|
// Analytics deep paths with timestamps and multiple segments
|
||||||
|
.replaceAll(
|
||||||
|
"/analytics/dataInsights/[^/]+/[^/]+", "/analytics/dataInsights/{type}/{id}")
|
||||||
|
.replaceAll(
|
||||||
|
"/analytics/web/events/[^/]+/[^/]+/collect",
|
||||||
|
"/analytics/web/events/{name}/{timestamp}/collect")
|
||||||
|
// Data quality multi-level paths
|
||||||
|
.replaceAll("/dataQuality/testCases/[^/]+/[^/]+", "/dataQuality/testCases/{type}/{id}")
|
||||||
|
.replaceAll(
|
||||||
|
"/dataQuality/testSuites/[^/]+/[^/]+", "/dataQuality/testSuites/{id}/{subresource}")
|
||||||
|
// Complex lineage patterns with multiple entities
|
||||||
|
.replaceAll(
|
||||||
|
"/lineage/[^/]+/[^/]+/[^/]+/[^/]+",
|
||||||
|
"/lineage/{fromEntity}/{fromId}/{toEntity}/{toId}")
|
||||||
|
.replaceAll(
|
||||||
|
"/lineage/[^/]+/name/[^/]+/[^/]+/name/[^/]+",
|
||||||
|
"/lineage/{fromEntity}/name/{fromFQN}/{toEntity}/name/{toFQN}")
|
||||||
|
.replaceAll(
|
||||||
|
"/lineage/[^/]+/[^/]+/type/[^/]+",
|
||||||
|
"/lineage/{entityType}/{entityId}/type/{lineageSource}")
|
||||||
|
// Event subscriptions complex paths
|
||||||
|
.replaceAll(
|
||||||
|
"/events/subscriptions/[^/]+/[^/]+/[^/]+",
|
||||||
|
"/events/subscriptions/{id}/{resource}/{subresource}")
|
||||||
|
.replaceAll(
|
||||||
|
"/events/subscriptions/name/[^/]+/[^/]+",
|
||||||
|
"/events/subscriptions/name/{name}/{resource}")
|
||||||
|
// Service nested paths
|
||||||
|
.replaceAll("/services/[^/]+/[^/]+/[^/]+", "/services/{serviceType}/{id}/{subresource}")
|
||||||
|
.replaceAll(
|
||||||
|
"/services/testConnectionDefinitions/[^/]+",
|
||||||
|
"/services/testConnectionDefinitions/{connectionType}")
|
||||||
|
// Governance workflow paths
|
||||||
|
.replaceAll(
|
||||||
|
"/governance/[^/]+/[^/]+/[^/]+",
|
||||||
|
"/governance/{workflowType}/{definitionName}/{instanceId}")
|
||||||
|
// Drive/file management paths
|
||||||
|
.replaceAll("/drives/[^/]+/[^/]+/[^/]+", "/drives/{type}/{id}/{subresource}")
|
||||||
|
// Universal entity sub-resources (versions, followers, results, etc.)
|
||||||
|
.replaceAll("/([^/]+)/([^/]+)/versions/[^/]+", "/$1/$2/versions/{version}")
|
||||||
|
.replaceAll("/([^/]+)/([^/]+)/followers/[^/]+", "/$1/$2/followers/{userId}")
|
||||||
|
.replaceAll("/([^/]+)/([^/]+)/results/[^/]+", "/$1/$2/results/{result}")
|
||||||
|
.replaceAll(
|
||||||
|
"/([^/]+)/([^/]+)/results/before/[^/]+", "/$1/$2/results/before/{timestamp}")
|
||||||
|
.replaceAll(
|
||||||
|
"/([^/]+)/name/([^/]+)/(export|import|exportAsync|importAsync)", "/$1/name/$2/$3")
|
||||||
|
// SCIM paths
|
||||||
|
.replaceAll("/scim/(Users|Groups)/[^/]+", "/scim/$1/{id}")
|
||||||
|
// Permission resource patterns
|
||||||
|
.replaceAll("/permissions/[^/]+/[^/]+", "/permissions/{resource}/{id}")
|
||||||
|
.replaceAll("/permissions/[^/]+/name/[^/]+", "/permissions/{resource}/name/{name}")
|
||||||
|
.replaceAll("/permissions/view/[^/]+", "/permissions/view/{entityType}")
|
||||||
|
.replaceAll("/permissions/debug/user/[^/]+", "/permissions/debug/user/{username}")
|
||||||
|
.replaceAll("/permissions/debug/evaluate", "/permissions/debug/evaluate")
|
||||||
|
.replaceAll("/permissions/[^/]+", "/permissions/{resource}")
|
||||||
|
// EventSubscription complex patterns (HIGH PRIORITY - prevents cardinality explosion)
|
||||||
|
.replaceAll(
|
||||||
|
"/events/subscriptions/name/[^/]+/status/[^/]+",
|
||||||
|
"/events/subscriptions/name/{name}/status/{destinationId}")
|
||||||
|
.replaceAll(
|
||||||
|
"/events/subscriptions/[^/]+/status/[^/]+",
|
||||||
|
"/events/subscriptions/{id}/status/{destinationId}")
|
||||||
|
.replaceAll(
|
||||||
|
"/events/subscriptions/[^/]+/resources",
|
||||||
|
"/events/subscriptions/{alertType}/resources")
|
||||||
|
.replaceAll(
|
||||||
|
"/events/subscriptions/id/[^/]+/listEvents",
|
||||||
|
"/events/subscriptions/id/{id}/listEvents")
|
||||||
|
.replaceAll(
|
||||||
|
"/events/subscriptions/id/[^/]+/eventsRecord",
|
||||||
|
"/events/subscriptions/id/{subscriptionId}/eventsRecord")
|
||||||
|
.replaceAll(
|
||||||
|
"/events/subscriptions/name/[^/]+/eventsRecord",
|
||||||
|
"/events/subscriptions/name/{subscriptionName}/eventsRecord")
|
||||||
|
.replaceAll(
|
||||||
|
"/events/subscriptions/id/[^/]+/diagnosticInfo",
|
||||||
|
"/events/subscriptions/id/{subscriptionId}/diagnosticInfo")
|
||||||
|
.replaceAll(
|
||||||
|
"/events/subscriptions/name/[^/]+/diagnosticInfo",
|
||||||
|
"/events/subscriptions/name/{subscriptionName}/diagnosticInfo")
|
||||||
|
.replaceAll(
|
||||||
|
"/events/subscriptions/id/[^/]+/failedEvents",
|
||||||
|
"/events/subscriptions/id/{id}/failedEvents")
|
||||||
|
.replaceAll(
|
||||||
|
"/events/subscriptions/name/[^/]+/failedEvents",
|
||||||
|
"/events/subscriptions/name/{eventSubscriptionName}/failedEvents")
|
||||||
|
.replaceAll(
|
||||||
|
"/events/subscriptions/id/[^/]+/listSuccessfullySentChangeEvents",
|
||||||
|
"/events/subscriptions/id/{id}/listSuccessfullySentChangeEvents")
|
||||||
|
.replaceAll(
|
||||||
|
"/events/subscriptions/name/[^/]+/listSuccessfullySentChangeEvents",
|
||||||
|
"/events/subscriptions/name/{eventSubscriptionName}/listSuccessfullySentChangeEvents")
|
||||||
|
.replaceAll(
|
||||||
|
"/events/subscriptions/id/[^/]+/destinations",
|
||||||
|
"/events/subscriptions/id/{eventSubscriptionId}/destinations")
|
||||||
|
.replaceAll(
|
||||||
|
"/events/subscriptions/name/[^/]+/destinations",
|
||||||
|
"/events/subscriptions/name/{eventSubscriptionName}/destinations")
|
||||||
|
.replaceAll(
|
||||||
|
"/events/subscriptions/name/[^/]+/syncOffset",
|
||||||
|
"/events/subscriptions/name/{eventSubscriptionName}/syncOffset")
|
||||||
|
// App management patterns
|
||||||
|
.replaceAll("/apps/name/[^/]+/status", "/apps/name/{name}/status")
|
||||||
|
.replaceAll("/apps/name/[^/]+/extension", "/apps/name/{name}/extension")
|
||||||
|
.replaceAll("/apps/name/[^/]+/logs", "/apps/name/{name}/logs")
|
||||||
|
.replaceAll("/apps/name/[^/]+/runs/latest", "/apps/name/{name}/runs/latest")
|
||||||
|
.replaceAll("/apps/schedule/[^/]+", "/apps/schedule/{name}")
|
||||||
|
.replaceAll("/apps/configure/[^/]+", "/apps/configure/{name}")
|
||||||
|
.replaceAll("/apps/trigger/[^/]+", "/apps/trigger/{name}")
|
||||||
|
.replaceAll("/apps/stop/[^/]+", "/apps/stop/{name}")
|
||||||
|
.replaceAll("/apps/deploy/[^/]+", "/apps/deploy/{name}")
|
||||||
|
// IngestionPipeline operational patterns
|
||||||
|
.replaceAll(
|
||||||
|
"/services/ingestionPipelines/deploy/[^/]+",
|
||||||
|
"/services/ingestionPipelines/deploy/{id}")
|
||||||
|
.replaceAll(
|
||||||
|
"/services/ingestionPipelines/trigger/[^/]+",
|
||||||
|
"/services/ingestionPipelines/trigger/{id}")
|
||||||
|
.replaceAll(
|
||||||
|
"/services/ingestionPipelines/toggleIngestion/[^/]+",
|
||||||
|
"/services/ingestionPipelines/toggleIngestion/{id}")
|
||||||
|
.replaceAll(
|
||||||
|
"/services/ingestionPipelines/kill/[^/]+", "/services/ingestionPipelines/kill/{id}")
|
||||||
|
.replaceAll(
|
||||||
|
"/services/ingestionPipelines/logs/[^/]+/last",
|
||||||
|
"/services/ingestionPipelines/logs/{id}/last")
|
||||||
|
.replaceAll(
|
||||||
|
"/services/ingestionPipelines/[^/]+/pipelineStatus/[^/]+",
|
||||||
|
"/services/ingestionPipelines/{fqn}/pipelineStatus/{id}")
|
||||||
|
.replaceAll(
|
||||||
|
"/services/ingestionPipelines/[^/]+/pipelineStatus",
|
||||||
|
"/services/ingestionPipelines/{fqn}/pipelineStatus")
|
||||||
|
// Search resource patterns
|
||||||
|
.replaceAll("/search/get/[^/]+/doc/[^/]+", "/search/get/{index}/doc/{id}")
|
||||||
|
// User authentication & security patterns
|
||||||
|
.replaceAll("/users/generateToken/[^/]+", "/users/generateToken/{id}")
|
||||||
|
.replaceAll("/users/token/[^/]+", "/users/token/{id}")
|
||||||
|
.replaceAll("/users/auth-mechanism/[^/]+", "/users/auth-mechanism/{id}")
|
||||||
|
// Feed & discussion patterns
|
||||||
|
.replaceAll("/feed/tasks/[^/]+/resolve", "/feed/tasks/{id}/resolve")
|
||||||
|
.replaceAll("/feed/tasks/[^/]+/close", "/feed/tasks/{id}/close")
|
||||||
|
.replaceAll("/feed/tasks/[^/]+", "/feed/tasks/{id}")
|
||||||
|
.replaceAll("/feed/[^/]+/posts/[^/]+", "/feed/{threadId}/posts/{postId}")
|
||||||
|
.replaceAll("/feed/[^/]+/posts", "/feed/{id}/posts")
|
||||||
|
.replaceAll("/feed/[^/]+", "/feed/{threadId}")
|
||||||
|
// System & configuration patterns
|
||||||
|
.replaceAll("/system/settings/[^/]+", "/system/settings/{name}")
|
||||||
|
.replaceAll("/system/settings/reset/[^/]+", "/system/settings/reset/{name}")
|
||||||
|
// DocStore patterns
|
||||||
|
.replaceAll(
|
||||||
|
"/docStore/validateTemplate/[^/]+", "/docStore/validateTemplate/{templateName}")
|
||||||
|
// Handle remaining timestamp patterns
|
||||||
|
.replaceAll("/[0-9]{10,13}", "/{timestamp}");
|
||||||
|
|
||||||
|
// Ensure we don't have empty path segments
|
||||||
|
normalizedUri = normalizedUri.replaceAll("/+", "/");
|
||||||
|
|
||||||
|
// Limit to reasonable URI length to prevent edge cases
|
||||||
|
if (normalizedUri.length() > 100) {
|
||||||
|
// For very long URIs, just use the first few path segments
|
||||||
|
String[] segments = normalizedUri.split("/");
|
||||||
|
if (segments.length > 5) {
|
||||||
|
normalizedUri = String.join("/", java.util.Arrays.copyOfRange(segments, 0, 5)) + "/...";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return normalizedUri.isEmpty() ? "/" : normalizedUri;
|
||||||
|
}
|
||||||
|
}
|
||||||
@ -138,8 +138,7 @@ public class MicrometerBundle implements ConfiguredBundle<OpenMetadataApplicatio
|
|||||||
// HTTP request histogram
|
// HTTP request histogram
|
||||||
io.micrometer.core.instrument.Timer.builder("http_server_requests_sec")
|
io.micrometer.core.instrument.Timer.builder("http_server_requests_sec")
|
||||||
.description("HTTP methods duration")
|
.description("HTTP methods duration")
|
||||||
.publishPercentileHistogram()
|
.sla(
|
||||||
.serviceLevelObjectives(
|
|
||||||
java.time.Duration.ofMillis(10),
|
java.time.Duration.ofMillis(10),
|
||||||
java.time.Duration.ofMillis(100),
|
java.time.Duration.ofMillis(100),
|
||||||
java.time.Duration.ofSeconds(1),
|
java.time.Duration.ofSeconds(1),
|
||||||
@ -153,8 +152,7 @@ public class MicrometerBundle implements ConfiguredBundle<OpenMetadataApplicatio
|
|||||||
// JDBI request histogram
|
// JDBI request histogram
|
||||||
io.micrometer.core.instrument.Timer.builder("jdbi_requests_seconds")
|
io.micrometer.core.instrument.Timer.builder("jdbi_requests_seconds")
|
||||||
.description("jdbi requests duration distribution")
|
.description("jdbi requests duration distribution")
|
||||||
.publishPercentileHistogram()
|
.sla(
|
||||||
.serviceLevelObjectives(
|
|
||||||
java.time.Duration.ofMillis(10),
|
java.time.Duration.ofMillis(10),
|
||||||
java.time.Duration.ofMillis(100),
|
java.time.Duration.ofMillis(100),
|
||||||
java.time.Duration.ofSeconds(1),
|
java.time.Duration.ofSeconds(1),
|
||||||
|
|||||||
@ -1,5 +1,8 @@
|
|||||||
package org.openmetadata.service.monitoring;
|
package org.openmetadata.service.monitoring;
|
||||||
|
|
||||||
|
import static org.openmetadata.service.monitoring.MetricUtils.LATENCY_SLA_BUCKETS;
|
||||||
|
import static org.openmetadata.service.monitoring.MetricUtils.normalizeUri;
|
||||||
|
|
||||||
import io.micrometer.core.instrument.Counter;
|
import io.micrometer.core.instrument.Counter;
|
||||||
import io.micrometer.core.instrument.DistributionSummary;
|
import io.micrometer.core.instrument.DistributionSummary;
|
||||||
import io.micrometer.core.instrument.MeterRegistry;
|
import io.micrometer.core.instrument.MeterRegistry;
|
||||||
@ -51,12 +54,7 @@ public class OpenMetadataMetrics {
|
|||||||
this.httpRequestTimer =
|
this.httpRequestTimer =
|
||||||
Timer.builder("http.server.requests")
|
Timer.builder("http.server.requests")
|
||||||
.description("HTTP server request duration")
|
.description("HTTP server request duration")
|
||||||
.publishPercentileHistogram()
|
.sla(LATENCY_SLA_BUCKETS)
|
||||||
.serviceLevelObjectives(
|
|
||||||
Duration.ofMillis(50),
|
|
||||||
Duration.ofMillis(100),
|
|
||||||
Duration.ofMillis(500),
|
|
||||||
Duration.ofSeconds(1))
|
|
||||||
.register(meterRegistry);
|
.register(meterRegistry);
|
||||||
|
|
||||||
this.httpRequestCounter =
|
this.httpRequestCounter =
|
||||||
@ -68,16 +66,14 @@ public class OpenMetadataMetrics {
|
|||||||
DistributionSummary.builder("http.server.response.size")
|
DistributionSummary.builder("http.server.response.size")
|
||||||
.description("HTTP response size in bytes")
|
.description("HTTP response size in bytes")
|
||||||
.baseUnit("bytes")
|
.baseUnit("bytes")
|
||||||
.publishPercentileHistogram()
|
.serviceLevelObjectives(1024, 4096, 16384, 65536, 262144, 1048576, 4194304, 16777216)
|
||||||
.register(meterRegistry);
|
.register(meterRegistry);
|
||||||
|
|
||||||
// Initialize Database metrics
|
// Initialize Database metrics
|
||||||
this.jdbiQueryTimer =
|
this.jdbiQueryTimer =
|
||||||
Timer.builder("db.query.duration")
|
Timer.builder("db.query.duration")
|
||||||
.description("Database query duration")
|
.description("Database query duration")
|
||||||
.publishPercentileHistogram()
|
.sla(LATENCY_SLA_BUCKETS)
|
||||||
.serviceLevelObjectives(
|
|
||||||
Duration.ofMillis(10), Duration.ofMillis(50), Duration.ofMillis(100))
|
|
||||||
.register(meterRegistry);
|
.register(meterRegistry);
|
||||||
|
|
||||||
this.jdbiConnectionCounter =
|
this.jdbiConnectionCounter =
|
||||||
@ -123,7 +119,7 @@ public class OpenMetadataMetrics {
|
|||||||
this.pipelineExecutionTimer =
|
this.pipelineExecutionTimer =
|
||||||
Timer.builder("pipeline.execution.duration")
|
Timer.builder("pipeline.execution.duration")
|
||||||
.description("Pipeline execution duration")
|
.description("Pipeline execution duration")
|
||||||
.publishPercentileHistogram()
|
.sla(LATENCY_SLA_BUCKETS)
|
||||||
.register(meterRegistry);
|
.register(meterRegistry);
|
||||||
|
|
||||||
// Initialize Authentication metrics
|
// Initialize Authentication metrics
|
||||||
@ -265,14 +261,6 @@ public class OpenMetadataMetrics {
|
|||||||
.register(meterRegistry);
|
.register(meterRegistry);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Utility methods
|
|
||||||
private String normalizeUri(String uri) {
|
|
||||||
// Normalize URIs to avoid high cardinality
|
|
||||||
// Replace IDs with placeholders
|
|
||||||
return uri.replaceAll("/[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}", "/{id}")
|
|
||||||
.replaceAll("/[0-9]+", "/{id}");
|
|
||||||
}
|
|
||||||
|
|
||||||
private String getStatusClass(int status) {
|
private String getStatusClass(int status) {
|
||||||
if (status >= 100 && status < 200) return "1xx";
|
if (status >= 100 && status < 200) return "1xx";
|
||||||
if (status >= 200 && status < 300) return "2xx";
|
if (status >= 200 && status < 300) return "2xx";
|
||||||
|
|||||||
@ -1,5 +1,8 @@
|
|||||||
package org.openmetadata.service.monitoring;
|
package org.openmetadata.service.monitoring;
|
||||||
|
|
||||||
|
import static org.openmetadata.service.monitoring.MetricUtils.LATENCY_SLA_BUCKETS;
|
||||||
|
import static org.openmetadata.service.monitoring.MetricUtils.normalizeUri;
|
||||||
|
|
||||||
import io.micrometer.core.instrument.Gauge;
|
import io.micrometer.core.instrument.Gauge;
|
||||||
import io.micrometer.core.instrument.Metrics;
|
import io.micrometer.core.instrument.Metrics;
|
||||||
import io.micrometer.core.instrument.Timer;
|
import io.micrometer.core.instrument.Timer;
|
||||||
@ -48,14 +51,14 @@ public class RequestLatencyContext {
|
|||||||
public static void startRequest(String endpoint) {
|
public static void startRequest(String endpoint) {
|
||||||
RequestContext context = new RequestContext(endpoint);
|
RequestContext context = new RequestContext(endpoint);
|
||||||
requestContext.set(context);
|
requestContext.set(context);
|
||||||
|
String normalizedEndpoint = normalizeUri(endpoint);
|
||||||
requestTimers.computeIfAbsent(
|
requestTimers.computeIfAbsent(
|
||||||
endpoint,
|
normalizedEndpoint,
|
||||||
k ->
|
k ->
|
||||||
Timer.builder("request.latency.total")
|
Timer.builder("request.latency.total")
|
||||||
.tag(ENDPOINT, endpoint)
|
.tag(ENDPOINT, normalizedEndpoint)
|
||||||
.description("Total request latency")
|
.description("Total request latency")
|
||||||
.publishPercentileHistogram()
|
.sla(LATENCY_SLA_BUCKETS)
|
||||||
.register(Metrics.globalRegistry));
|
.register(Metrics.globalRegistry));
|
||||||
context.requestTimerSample = Timer.start(Metrics.globalRegistry);
|
context.requestTimerSample = Timer.start(Metrics.globalRegistry);
|
||||||
context.internalTimerStartNanos = System.nanoTime();
|
context.internalTimerStartNanos = System.nanoTime();
|
||||||
@ -128,10 +131,11 @@ public class RequestLatencyContext {
|
|||||||
RequestContext context = requestContext.get();
|
RequestContext context = requestContext.get();
|
||||||
if (context == null) return;
|
if (context == null) return;
|
||||||
|
|
||||||
|
String normalizedEndpoint = normalizeUri(context.endpoint);
|
||||||
try {
|
try {
|
||||||
// Stop request timer
|
// Stop request timer
|
||||||
if (context.requestTimerSample != null) {
|
if (context.requestTimerSample != null) {
|
||||||
Timer requestTimer = requestTimers.get(context.endpoint);
|
Timer requestTimer = requestTimers.get(normalizedEndpoint);
|
||||||
if (requestTimer != null) {
|
if (requestTimer != null) {
|
||||||
context.totalTime = context.requestTimerSample.stop(requestTimer);
|
context.totalTime = context.requestTimerSample.stop(requestTimer);
|
||||||
}
|
}
|
||||||
@ -145,47 +149,47 @@ public class RequestLatencyContext {
|
|||||||
// This gives us the total DB time for THIS request
|
// This gives us the total DB time for THIS request
|
||||||
Timer dbTimer =
|
Timer dbTimer =
|
||||||
databaseTimers.computeIfAbsent(
|
databaseTimers.computeIfAbsent(
|
||||||
context.endpoint,
|
normalizedEndpoint,
|
||||||
k ->
|
k ->
|
||||||
Timer.builder("request.latency.database")
|
Timer.builder("request.latency.database")
|
||||||
.tag(ENDPOINT, context.endpoint)
|
.tag(ENDPOINT, normalizedEndpoint)
|
||||||
.description("Total database latency per request")
|
.description("Total database latency per request")
|
||||||
.publishPercentileHistogram()
|
.sla(LATENCY_SLA_BUCKETS)
|
||||||
.register(Metrics.globalRegistry));
|
.register(Metrics.globalRegistry));
|
||||||
dbTimer.record(context.dbTime, java.util.concurrent.TimeUnit.NANOSECONDS);
|
dbTimer.record(context.dbTime, java.util.concurrent.TimeUnit.NANOSECONDS);
|
||||||
|
|
||||||
// Record total search time for THIS request
|
// Record total search time for THIS request
|
||||||
Timer searchTimer =
|
Timer searchTimer =
|
||||||
searchTimers.computeIfAbsent(
|
searchTimers.computeIfAbsent(
|
||||||
context.endpoint,
|
normalizedEndpoint,
|
||||||
k ->
|
k ->
|
||||||
Timer.builder("request.latency.search")
|
Timer.builder("request.latency.search")
|
||||||
.tag(ENDPOINT, context.endpoint)
|
.tag(ENDPOINT, normalizedEndpoint)
|
||||||
.description("Total search latency per request")
|
.description("Total search latency per request")
|
||||||
.publishPercentileHistogram()
|
.sla(LATENCY_SLA_BUCKETS)
|
||||||
.register(Metrics.globalRegistry));
|
.register(Metrics.globalRegistry));
|
||||||
searchTimer.record(context.searchTime, java.util.concurrent.TimeUnit.NANOSECONDS);
|
searchTimer.record(context.searchTime, java.util.concurrent.TimeUnit.NANOSECONDS);
|
||||||
|
|
||||||
// Record internal processing time for THIS request
|
// Record internal processing time for THIS request
|
||||||
Timer internalTimer =
|
Timer internalTimer =
|
||||||
internalTimers.computeIfAbsent(
|
internalTimers.computeIfAbsent(
|
||||||
context.endpoint,
|
normalizedEndpoint,
|
||||||
k ->
|
k ->
|
||||||
Timer.builder("request.latency.internal")
|
Timer.builder("request.latency.internal")
|
||||||
.tag(ENDPOINT, context.endpoint)
|
.tag(ENDPOINT, normalizedEndpoint)
|
||||||
.description("Internal processing latency per request")
|
.description("Internal processing latency per request")
|
||||||
.publishPercentileHistogram()
|
.sla(LATENCY_SLA_BUCKETS)
|
||||||
.register(Metrics.globalRegistry));
|
.register(Metrics.globalRegistry));
|
||||||
internalTimer.record(context.internalTime, java.util.concurrent.TimeUnit.NANOSECONDS);
|
internalTimer.record(context.internalTime, java.util.concurrent.TimeUnit.NANOSECONDS);
|
||||||
|
|
||||||
// Record operation counts as distribution summaries to get avg/max/percentiles
|
// Record operation counts as distribution summaries to get avg/max/percentiles
|
||||||
if (context.dbOperationCount > 0) {
|
if (context.dbOperationCount > 0) {
|
||||||
Metrics.summary("request.operations.database", ENDPOINT, context.endpoint)
|
Metrics.summary("request.operations.database", ENDPOINT, normalizedEndpoint)
|
||||||
.record(context.dbOperationCount);
|
.record(context.dbOperationCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (context.searchOperationCount > 0) {
|
if (context.searchOperationCount > 0) {
|
||||||
Metrics.summary("request.operations.search", ENDPOINT, context.endpoint)
|
Metrics.summary("request.operations.search", ENDPOINT, normalizedEndpoint)
|
||||||
.record(context.searchOperationCount);
|
.record(context.searchOperationCount);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -198,23 +202,23 @@ public class RequestLatencyContext {
|
|||||||
// Get or create percentage holder for this endpoint
|
// Get or create percentage holder for this endpoint
|
||||||
PercentageHolder holder =
|
PercentageHolder holder =
|
||||||
percentageHolders.computeIfAbsent(
|
percentageHolders.computeIfAbsent(
|
||||||
context.endpoint,
|
normalizedEndpoint,
|
||||||
k -> {
|
k -> {
|
||||||
PercentageHolder newHolder = new PercentageHolder();
|
PercentageHolder newHolder = new PercentageHolder();
|
||||||
|
|
||||||
// Register gauges that read from the atomic references
|
// Register gauges that read from the atomic references
|
||||||
Gauge.builder("request.percentage.database", newHolder.databasePercent::get)
|
Gauge.builder("request.percentage.database", newHolder.databasePercent::get)
|
||||||
.tag(ENDPOINT, context.endpoint)
|
.tag(ENDPOINT, normalizedEndpoint)
|
||||||
.description("Percentage of request time spent in database operations")
|
.description("Percentage of request time spent in database operations")
|
||||||
.register(Metrics.globalRegistry);
|
.register(Metrics.globalRegistry);
|
||||||
|
|
||||||
Gauge.builder("request.percentage.search", newHolder.searchPercent::get)
|
Gauge.builder("request.percentage.search", newHolder.searchPercent::get)
|
||||||
.tag(ENDPOINT, context.endpoint)
|
.tag(ENDPOINT, normalizedEndpoint)
|
||||||
.description("Percentage of request time spent in search operations")
|
.description("Percentage of request time spent in search operations")
|
||||||
.register(Metrics.globalRegistry);
|
.register(Metrics.globalRegistry);
|
||||||
|
|
||||||
Gauge.builder("request.percentage.internal", newHolder.internalPercent::get)
|
Gauge.builder("request.percentage.internal", newHolder.internalPercent::get)
|
||||||
.tag(ENDPOINT, context.endpoint)
|
.tag(ENDPOINT, normalizedEndpoint)
|
||||||
.description("Percentage of request time spent in internal processing")
|
.description("Percentage of request time spent in internal processing")
|
||||||
.register(Metrics.globalRegistry);
|
.register(Metrics.globalRegistry);
|
||||||
|
|
||||||
|
|||||||
@ -39,10 +39,11 @@ class RequestLatencyContextTest {
|
|||||||
|
|
||||||
simulateWork(20);
|
simulateWork(20);
|
||||||
|
|
||||||
|
String normalizedEndpoint = MetricUtils.normalizeUri(endpoint);
|
||||||
RequestLatencyContext.endRequest();
|
RequestLatencyContext.endRequest();
|
||||||
Timer totalTimer = Metrics.timer("request.latency.total", "endpoint", endpoint);
|
Timer totalTimer = Metrics.timer("request.latency.total", "endpoint", normalizedEndpoint);
|
||||||
Timer dbTimer = Metrics.timer("request.latency.database", "endpoint", endpoint);
|
Timer dbTimer = Metrics.timer("request.latency.database", "endpoint", normalizedEndpoint);
|
||||||
Timer internalTimer = Metrics.timer("request.latency.internal", "endpoint", endpoint);
|
Timer internalTimer = Metrics.timer("request.latency.internal", "endpoint", normalizedEndpoint);
|
||||||
|
|
||||||
assertEquals(1, totalTimer.count(), "Should have recorded 1 request");
|
assertEquals(1, totalTimer.count(), "Should have recorded 1 request");
|
||||||
assertEquals(1, dbTimer.count(), "Should have recorded 1 request with database operations");
|
assertEquals(1, dbTimer.count(), "Should have recorded 1 request with database operations");
|
||||||
|
|||||||
@ -34,15 +34,16 @@ class RequestLatencyTrackingSimpleTest {
|
|||||||
simulateWork(30);
|
simulateWork(30);
|
||||||
RequestLatencyContext.endRequest();
|
RequestLatencyContext.endRequest();
|
||||||
|
|
||||||
Timer totalTimer = Metrics.timer("request.latency.total", "endpoint", endpoint);
|
String normalizedEndpoint = MetricUtils.normalizeUri(endpoint);
|
||||||
|
Timer totalTimer = Metrics.timer("request.latency.total", "endpoint", normalizedEndpoint);
|
||||||
assertNotNull(totalTimer);
|
assertNotNull(totalTimer);
|
||||||
assertEquals(1, totalTimer.count(), "Should have recorded 1 request");
|
assertEquals(1, totalTimer.count(), "Should have recorded 1 request");
|
||||||
|
|
||||||
Timer dbTimer = Metrics.timer("request.latency.database", "endpoint", endpoint);
|
Timer dbTimer = Metrics.timer("request.latency.database", "endpoint", normalizedEndpoint);
|
||||||
assertNotNull(dbTimer);
|
assertNotNull(dbTimer);
|
||||||
assertEquals(1, dbTimer.count(), "Should have recorded 1 database operation");
|
assertEquals(1, dbTimer.count(), "Should have recorded 1 database operation");
|
||||||
|
|
||||||
Timer internalTimer = Metrics.timer("request.latency.internal", "endpoint", endpoint);
|
Timer internalTimer = Metrics.timer("request.latency.internal", "endpoint", normalizedEndpoint);
|
||||||
assertNotNull(internalTimer);
|
assertNotNull(internalTimer);
|
||||||
assertEquals(1, internalTimer.count(), "Should have recorded internal processing");
|
assertEquals(1, internalTimer.count(), "Should have recorded internal processing");
|
||||||
|
|
||||||
@ -54,9 +55,14 @@ class RequestLatencyTrackingSimpleTest {
|
|||||||
LOG.info("Database time: {} ms", dbMs);
|
LOG.info("Database time: {} ms", dbMs);
|
||||||
LOG.info("Internal time: {} ms", internalMs);
|
LOG.info("Internal time: {} ms", internalMs);
|
||||||
|
|
||||||
assertTrue(totalMs >= 150 && totalMs <= 210, "Total time should be ~180ms, got: " + totalMs);
|
// Timing expectations: 500ms + 100ms + 30ms = 630ms total
|
||||||
assertTrue(dbMs >= 80 && dbMs <= 120, "Database time should be ~100ms, got: " + dbMs);
|
// DB time: 100ms during database operation
|
||||||
|
// Internal time: 500ms (before DB) + 30ms (after DB) = 530ms
|
||||||
|
// Allow generous bounds for system timing variations
|
||||||
|
assertTrue(totalMs >= 500 && totalMs <= 1000, "Total time should be ~630ms, got: " + totalMs);
|
||||||
|
assertTrue(dbMs >= 80 && dbMs <= 150, "Database time should be ~100ms, got: " + dbMs);
|
||||||
assertTrue(
|
assertTrue(
|
||||||
internalMs >= 60 && internalMs <= 100, "Internal time should be ~80ms, got: " + internalMs);
|
internalMs >= 400 && internalMs <= 700,
|
||||||
|
"Internal time should be ~530ms, got: " + internalMs);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -123,10 +123,11 @@ class RequestLatencyTrackingTest extends OpenMetadataApplicationTest {
|
|||||||
// Check for specific endpoint metrics - the actual metrics use the endpoint path as-is
|
// Check for specific endpoint metrics - the actual metrics use the endpoint path as-is
|
||||||
LOG.info("Looking for metrics with endpoint: {}", getEndpoint);
|
LOG.info("Looking for metrics with endpoint: {}", getEndpoint);
|
||||||
|
|
||||||
|
String normalizedUri = MetricUtils.normalizeUri("v1/tables/" + createdTable.getId());
|
||||||
// Parse and verify latency metrics
|
// Parse and verify latency metrics
|
||||||
assertLatencyMetricsExist(prometheusMetrics, "request_latency_total", getEndpoint);
|
assertLatencyMetricsExist(prometheusMetrics, "request_latency_total", normalizedUri);
|
||||||
assertLatencyMetricsExist(prometheusMetrics, "request_latency_database", getEndpoint);
|
assertLatencyMetricsExist(prometheusMetrics, "request_latency_database", normalizedUri);
|
||||||
assertLatencyMetricsExist(prometheusMetrics, "request_latency_internal", getEndpoint);
|
assertLatencyMetricsExist(prometheusMetrics, "request_latency_internal", normalizedUri);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@ -205,7 +206,7 @@ class RequestLatencyTrackingTest extends OpenMetadataApplicationTest {
|
|||||||
|
|
||||||
WebTarget complexTarget =
|
WebTarget complexTarget =
|
||||||
getResource("tables/" + createdTable.getId())
|
getResource("tables/" + createdTable.getId())
|
||||||
.queryParam("fields", "owners,tags,followers,columns,domain,dataProducts,extension")
|
.queryParam("fields", "owners,tags,followers,columns,domains,dataProducts,extension")
|
||||||
.queryParam("include", "all");
|
.queryParam("include", "all");
|
||||||
TestUtils.get(complexTarget, Table.class, ADMIN_AUTH_HEADERS);
|
TestUtils.get(complexTarget, Table.class, ADMIN_AUTH_HEADERS);
|
||||||
|
|
||||||
@ -218,7 +219,8 @@ class RequestLatencyTrackingTest extends OpenMetadataApplicationTest {
|
|||||||
String endpoint = "v1/tables/" + createdTable.getId();
|
String endpoint = "v1/tables/" + createdTable.getId();
|
||||||
|
|
||||||
// Verify database operation count
|
// Verify database operation count
|
||||||
assertLatencyMetricsExist(prometheusMetrics, "request_latency_database", endpoint);
|
assertLatencyMetricsExist(
|
||||||
|
prometheusMetrics, "request_latency_database", MetricUtils.normalizeUri(endpoint));
|
||||||
|
|
||||||
// Check that we have multiple database operations recorded
|
// Check that we have multiple database operations recorded
|
||||||
assertTrue(
|
assertTrue(
|
||||||
@ -247,7 +249,9 @@ class RequestLatencyTrackingTest extends OpenMetadataApplicationTest {
|
|||||||
private void assertLatencyMetricsExist(
|
private void assertLatencyMetricsExist(
|
||||||
String prometheusOutput, String metricName, String endpoint) {
|
String prometheusOutput, String metricName, String endpoint) {
|
||||||
// Look for metrics that contain the metric name with the endpoint label
|
// Look for metrics that contain the metric name with the endpoint label
|
||||||
String pattern = metricName + "_seconds.*endpoint=\"" + endpoint.replace("/", "\\/") + "\"";
|
// Escape regex special characters in the endpoint string
|
||||||
|
String escapedEndpoint = java.util.regex.Pattern.quote(endpoint);
|
||||||
|
String pattern = metricName + "_seconds.*endpoint=\"" + escapedEndpoint + "\"";
|
||||||
assertTrue(
|
assertTrue(
|
||||||
prometheusOutput.matches("(?s).*" + pattern + ".*"),
|
prometheusOutput.matches("(?s).*" + pattern + ".*"),
|
||||||
String.format(
|
String.format(
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user