diff --git a/openmetadata-docs/content/v1.1.0-snapshot/releases/roadmap/index.md b/openmetadata-docs/content/v1.1.0-snapshot/releases/roadmap/index.md
index f7afe54d1a6..a86d8554e52 100644
--- a/openmetadata-docs/content/v1.1.0-snapshot/releases/roadmap/index.md
+++ b/openmetadata-docs/content/v1.1.0-snapshot/releases/roadmap/index.md
@@ -22,7 +22,7 @@ You can check the latest release [here](/releases/all-releases).
- We will be adding support for NoSQL DB entities and Services with APIs
- Support for Long Entity Names such as S3 paths
- Import/Export support at all entities
-- Tag Propgation using Import/Export
+- Tag Propagation using Import/Export
- Thumbs up & down to capture popularity of the Entities
{% /tile %}
diff --git a/openmetadata-service/pom.xml b/openmetadata-service/pom.xml
index a435b556f38..fb61132ef84 100644
--- a/openmetadata-service/pom.xml
+++ b/openmetadata-service/pom.xml
@@ -227,6 +227,10 @@
org.flywaydb
flyway-mysql
+
+ io.github.resilience4j
+ resilience4j-retry
+
diff --git a/openmetadata-service/src/main/java/org/openmetadata/service/events/scheduled/PipelineServiceStatusJob.java b/openmetadata-service/src/main/java/org/openmetadata/service/events/scheduled/PipelineServiceStatusJob.java
index 855f7db572f..dc727f02890 100644
--- a/openmetadata-service/src/main/java/org/openmetadata/service/events/scheduled/PipelineServiceStatusJob.java
+++ b/openmetadata-service/src/main/java/org/openmetadata/service/events/scheduled/PipelineServiceStatusJob.java
@@ -4,9 +4,13 @@ import static org.openmetadata.service.events.scheduled.PipelineServiceStatusJob
import static org.openmetadata.service.events.scheduled.PipelineServiceStatusJobHandler.JOB_CONTEXT_METER_REGISTRY;
import static org.openmetadata.service.events.scheduled.PipelineServiceStatusJobHandler.JOB_CONTEXT_PIPELINE_SERVICE_CLIENT;
+import io.github.resilience4j.retry.Retry;
+import io.github.resilience4j.retry.RetryConfig;
import io.micrometer.core.instrument.Counter;
import io.micrometer.prometheus.PrometheusMeterRegistry;
+import java.time.Duration;
import java.util.Map;
+import java.util.function.Supplier;
import javax.ws.rs.core.Response;
import lombok.extern.slf4j.Slf4j;
import org.openmetadata.sdk.PipelineServiceClient;
@@ -22,6 +26,9 @@ public class PipelineServiceStatusJob implements Job {
private static final String UNHEALTHY_TAG_NAME = "unhealthy";
private static final String CLUSTER_TAG_NAME = "clusterName";
+ private static final Integer MAX_ATTEMPTS = 3;
+ private static final Integer BACKOFF_TIME_SECONDS = 5;
+
@Override
public void execute(JobExecutionContext jobExecutionContext) {
@@ -39,11 +46,35 @@ public class PipelineServiceStatusJob implements Job {
}
}
+ private String getServiceStatus(PipelineServiceClient pipelineServiceClient) {
+ RetryConfig retryConfig =
+ RetryConfig.custom()
+ .maxAttempts(MAX_ATTEMPTS)
+ .waitDuration(Duration.ofMillis(BACKOFF_TIME_SECONDS * 1_000L))
+ .retryOnResult(response -> !HEALTHY_STATUS.equals(response))
+ .failAfterMaxAttempts(false)
+ .build();
+
+ Retry retry = Retry.of("getServiceStatus", retryConfig);
+
+ Supplier responseSupplier =
+ () -> {
+ try {
+ Response response = pipelineServiceClient.getServiceStatus();
+ Map responseMap = (Map) response.getEntity();
+ return responseMap.get(STATUS_KEY) == null ? "unhealthy" : responseMap.get(STATUS_KEY);
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ };
+
+ return retry.executeSupplier(responseSupplier);
+ }
+
private void registerStatusMetric(
PipelineServiceClient pipelineServiceClient, PrometheusMeterRegistry meterRegistry, String clusterName) {
- Response response = pipelineServiceClient.getServiceStatus();
- Map responseMap = (Map) response.getEntity();
- if (responseMap.get(STATUS_KEY) == null || !HEALTHY_STATUS.equals(responseMap.get(STATUS_KEY))) {
+ String status = getServiceStatus(pipelineServiceClient);
+ if (!HEALTHY_STATUS.equals(status)) {
publishUnhealthyCounter(meterRegistry, clusterName);
}
}
diff --git a/pom.xml b/pom.xml
index 58fdd37f604..c4af9f1b07e 100644
--- a/pom.xml
+++ b/pom.xml
@@ -105,6 +105,7 @@
2.6
1.18.26
10.1.7
+ 1.7.0
@@ -458,6 +459,11 @@
${org.junit.jupiter.version}
test
+
+ io.github.resilience4j
+ resilience4j-retry
+ ${resilience4j.version}
+
org.reflections
reflections