fix(ingestion/grafana): stabilize integration tests (#14547)

This commit is contained in:
Jonny Dixon 2025-08-22 19:38:29 +01:00 committed by GitHub
parent 0add552473
commit 88842566cf
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 179 additions and 46 deletions

View File

@ -14,12 +14,19 @@ services:
- GF_AUTH_ANONYMOUS_ENABLED=true
- GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer
- GF_FEATURE_TOGGLES_ENABLE=publicDashboards
# Improve startup stability in CI
- GF_LOG_LEVEL=info
- GF_SERVER_HTTP_PORT=3000
- GF_DATABASE_WAL=true
# Reduce resource usage for CI
- GF_ANALYTICS_REPORTING_ENABLED=false
- GF_ANALYTICS_CHECK_FOR_UPDATES=false
healthcheck:
test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"]
interval: 10s
timeout: 5s
retries: 6
start_period: 30s
test: ["CMD-SHELL", "wget --no-verbose --tries=3 --timeout=10 --spider http://localhost:3000/api/health && wget --no-verbose --tries=1 --timeout=5 --spider http://localhost:3000/api/org || exit 1"]
interval: 15s
timeout: 10s
retries: 8
start_period: 45s
volumes:
- grafana-storage:/var/lib/grafana
- ./provisioning:/etc/grafana/provisioning
@ -39,11 +46,13 @@ services:
POSTGRES_DB: grafana
POSTGRES_USER: grafana
POSTGRES_PASSWORD: grafana
POSTGRES_INITDB_ARGS: "--data-checksums"
healthcheck:
test: [ "CMD-SHELL", "pg_isready -U grafana -d grafana" ]
interval: 10s
timeout: 5s
retries: 5
timeout: 10s
retries: 8
start_period: 30s
volumes:
- postgres-storage:/var/lib/postgresql/data
- ./postgres-init:/docker-entrypoint-initdb.d
@ -53,6 +62,8 @@ services:
prometheus:
image: prom/prometheus:latest
container_name: prometheus
ports:
- "9090:9090"
volumes:
- ./prometheus.yml:/etc/prometheus/prometheus.yml
command:
@ -60,6 +71,8 @@ services:
- '--storage.tsdb.path=/prometheus'
- '--web.console.libraries=/usr/share/prometheus/console_libraries'
- '--web.console.templates=/usr/share/prometheus/consoles'
- '--web.enable-lifecycle'
networks:
- grafana-network

View File

@ -6,11 +6,13 @@ import pytest
import pytest_docker.plugin
import requests
from freezegun import freeze_time
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from datahub.ingestion.run.pipeline import Pipeline
from datahub.testing import mce_helpers
from tests.test_helpers import fs_helpers
from tests.test_helpers.docker_helpers import cleanup_image
from tests.test_helpers.docker_helpers import cleanup_image, wait_for_port
pytestmark = pytest.mark.integration_batch_2
@ -27,36 +29,78 @@ class GrafanaClient:
"Authorization": f"Basic {b64encode(f'{admin_user}:{admin_password}'.encode()).decode()}",
"Content-Type": "application/json",
}
self.session = requests.Session()
retry_strategy = Retry(
total=5,
backoff_factor=2,
status_forcelist=[500, 502, 503, 504, 429],
allowed_methods=["GET", "POST"],
)
adapter = HTTPAdapter(max_retries=retry_strategy)
self.session.mount("http://", adapter)
self.session.mount("https://", adapter)
def create_service_account(self, name, role, max_retries=5):
    """Create a Grafana service account, retrying on request failures.

    Args:
        name: Name of the service account to create.
        role: Role assigned to the service account.
        max_retries: Maximum number of POST attempts before giving up.

    Returns:
        The decoded JSON body of the successful response, or None when
        every attempt failed.
    """
    return self._post_json_with_retry(
        f"{self.url}/api/serviceaccounts",
        {"name": name, "role": role, "isDisabled": False},
        what="service account",
        label=name,
        max_retries=max_retries,
    )

def create_api_key(self, service_account_id, key_name, role, max_retries=5):
    """Create a token for a service account, retrying on request failures.

    Args:
        service_account_id: Id of the service account that owns the token.
        key_name: Name of the token to create.
        role: Role sent in the token payload.
        max_retries: Maximum number of POST attempts before giving up.

    Returns:
        The value of the ``key`` field of the response, or None when every
        attempt failed or the response carried no ``key`` field.
    """
    api_key = self._post_json_with_retry(
        f"{self.url}/api/serviceaccounts/{service_account_id}/tokens",
        {"name": key_name, "role": role},
        what="API key",
        label=key_name,
        max_retries=max_retries,
    )
    if api_key is None:
        return None
    try:
        return api_key["key"]
    except (KeyError, TypeError):
        # A 2xx answer without a "key" field is reported like any other
        # failure instead of crashing the fixture with a bare KeyError.
        logging.error(
            f"API key response for '{key_name}' did not contain a 'key' field"
        )
        return None

def _post_json_with_retry(self, endpoint, payload, *, what, label, max_retries):
    """POST ``payload`` as JSON to ``endpoint`` with exponential backoff.

    Shared retry loop for the create_* methods. ``what`` names the resource
    in log messages and ``label`` is the resource name logged on success.
    Returns the decoded JSON body, or None once ``max_retries`` attempts
    have failed (or when ``max_retries`` is not positive).
    """
    for attempt in range(max_retries):
        try:
            response = self.session.post(
                endpoint,
                headers=self.headers,
                json=payload,
                timeout=15,
            )
            response.raise_for_status()
            result = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests versions whose
            # JSON decode error is not a RequestException subclass.
            if attempt < max_retries - 1:
                wait_time = 2**attempt  # Exponential backoff
                logging.warning(
                    f"Attempt {attempt + 1} failed to create {what}: {e}. Retrying in {wait_time}s..."
                )
                time.sleep(wait_time)
            else:
                logging.error(
                    f"Failed to create {what} after {max_retries} attempts: {e}"
                )
        else:
            logging.info(
                f"Successfully created {what} '{label}' on attempt {attempt + 1}"
            )
            return result
    return None
@pytest.fixture(scope="module")
@ -67,11 +111,14 @@ def test_resources_dir(pytestconfig):
@pytest.fixture(scope="module")
def test_api_key(loaded_grafana):
# Get the actual mapped port from Docker services
grafana_port = loaded_grafana.port_for("grafana", 3000)
url = f"http://localhost:{grafana_port}"
url = "http://localhost:3000"
admin_user = "admin"
admin_password = "admin"
# Wait for Grafana to be fully ready before creating service account
verify_grafana_fully_ready(loaded_grafana, timeout=180)
grafana_client = GrafanaClient(url, admin_user, admin_password)
service_account = grafana_client.create_service_account(
@ -96,8 +143,30 @@ def loaded_grafana(docker_compose_runner, test_resources_dir):
with docker_compose_runner(
test_resources_dir / "docker-compose.yml", "grafana"
) as docker_services:
# Docker Compose now waits for health check to pass before considering service ready
# Verify we can access the API endpoints as an additional safety check
# Wait for all services to be ready
wait_for_port(docker_services, "postgres", 5432, timeout=90)
# Prometheus container doesn't have bash, so use a simple HTTP check
def check_prometheus_ready() -> bool:
    """Return True when Prometheus answers ``/-/ready`` with HTTP 200.

    Any failure while resolving the mapped port or issuing the request is
    treated as "not ready yet" so the caller can keep polling.
    """
    try:
        prometheus_port = docker_services.port_for("prometheus", 9090)
        response = requests.get(
            f"http://localhost:{prometheus_port}/-/ready", timeout=5
        )
    except Exception:
        # Exception already covers requests' RequestException; listing both
        # in one tuple was redundant.
        return False
    return response.status_code == 200
wait_for_port(
docker_services,
"prometheus",
9090,
timeout=90,
checker=check_prometheus_ready,
)
wait_for_port(docker_services, "grafana", 3000, timeout=180)
# Additional verification that Grafana API is fully accessible
verify_grafana_api_ready(docker_services)
yield docker_services
@ -106,24 +175,29 @@ def loaded_grafana(docker_compose_runner, test_resources_dir):
def verify_grafana_api_ready(docker_services: pytest_docker.plugin.Services) -> None:
"""Robust verification that Grafana API is fully accessible after health check passes"""
import requests
grafana_port = docker_services.port_for("grafana", 3000)
base_url = f"http://localhost:{grafana_port}"
base_url = "http://localhost:3000"
# Configure requests session with retries
session = requests.Session()
retry_strategy = Retry(
total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504]
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("http://", adapter)
# Wait for API endpoints to be fully ready (health check might pass but API still initializing)
max_attempts = 30
max_attempts = 60
for attempt in range(max_attempts):
try:
# Test both basic API access and service account creation capability
api_url = f"{base_url}/api/search"
resp = requests.get(api_url, auth=("admin", "admin"), timeout=10)
resp = session.get(api_url, auth=("admin", "admin"), timeout=15)
if resp.status_code == 200:
# Also verify service account API is ready (needed for test_api_key fixture)
# Service accounts might not be available in all Grafana versions
sa_url = f"{base_url}/api/serviceaccounts"
sa_resp = requests.get(sa_url, auth=("admin", "admin"), timeout=10)
sa_resp = session.get(sa_url, auth=("admin", "admin"), timeout=15)
if sa_resp.status_code == 200:
logging.info(
@ -147,12 +221,58 @@ def verify_grafana_api_ready(docker_services: pytest_docker.plugin.Services) ->
logging.debug(f"API readiness check failed (attempt {attempt + 1}): {e}")
if attempt < max_attempts - 1:
time.sleep(2)
time.sleep(3)
logging.warning(f"Grafana API may not be fully ready after {max_attempts} attempts")
# Don't fail here - let the test proceed and provide better error info if needed
def verify_grafana_fully_ready(
    docker_services: pytest_docker.plugin.Services, timeout: int = 120
) -> None:
    """Poll Grafana until it is ready for service account operations.

    Repeatedly queries the health, org and service-account endpoints with
    the admin credentials until all of them answer acceptably or ``timeout``
    seconds have elapsed. This is best-effort: on timeout a warning is
    logged and the function returns normally.

    Args:
        docker_services: Docker services handle. Not used by this check,
            which talks to the fixed ``localhost:3000`` address.
        timeout: Maximum number of seconds to keep polling.
    """
    base_url = "http://localhost:3000"
    # Each endpoint is paired with the status codes that count as "ready".
    # Only the service-account endpoint may answer 404 (older Grafana
    # versions); a 404 from health or org means Grafana is not ready.
    endpoint_checks = (
        (f"{base_url}/api/health", (200,)),
        (f"{base_url}/api/org", (200,)),
        (f"{base_url}/api/serviceaccounts", (200, 404)),
    )
    retry_strategy = Retry(
        total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504]
    )
    poll_interval = 2
    # A monotonic clock keeps the deadline immune to wall-clock adjustments.
    deadline = time.monotonic() + timeout

    # The context manager closes pooled connections once polling ends.
    with requests.Session() as session:
        session.mount("http://", HTTPAdapter(max_retries=retry_strategy))

        while time.monotonic() < deadline:
            try:
                # all() stops at the first endpoint that is not ready.
                all_ready = all(
                    session.get(
                        endpoint, auth=("admin", "admin"), timeout=10
                    ).status_code
                    in accepted
                    for endpoint, accepted in endpoint_checks
                )
            except Exception as e:
                # Best-effort poll: any failure just means "try again".
                logging.debug(f"Grafana readiness check failed: {e}")
            else:
                if all_ready:
                    logging.info("Grafana is fully ready for operations")
                    return

            # Never sleep past the deadline.
            remaining = deadline - time.monotonic()
            if remaining > 0:
                time.sleep(min(poll_interval, remaining))

    logging.warning(f"Grafana may not be fully ready after {timeout}s timeout")
@freeze_time(FROZEN_TIME)
def test_grafana_basic_ingest(
loaded_grafana, pytestconfig, tmp_path, test_resources_dir, test_api_key