mirror of
https://github.com/datahub-project/datahub.git
synced 2025-11-02 11:49:23 +00:00
fix(ingestion/grafana): stabilize integration tests (#14547)
This commit is contained in:
parent
0add552473
commit
88842566cf
@ -14,12 +14,19 @@ services:
|
||||
- GF_AUTH_ANONYMOUS_ENABLED=true
|
||||
- GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer
|
||||
- GF_FEATURE_TOGGLES_ENABLE=publicDashboards
|
||||
# Improve startup stability in CI
|
||||
- GF_LOG_LEVEL=info
|
||||
- GF_SERVER_HTTP_PORT=3000
|
||||
- GF_DATABASE_WAL=true
|
||||
# Reduce resource usage for CI
|
||||
- GF_ANALYTICS_REPORTING_ENABLED=false
|
||||
- GF_ANALYTICS_CHECK_FOR_UPDATES=false
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 6
|
||||
start_period: 30s
|
||||
test: ["CMD-SHELL", "wget --no-verbose --tries=3 --timeout=10 --spider http://localhost:3000/api/health && wget --no-verbose --tries=1 --timeout=5 --spider http://localhost:3000/api/org || exit 1"]
|
||||
interval: 15s
|
||||
timeout: 10s
|
||||
retries: 8
|
||||
start_period: 45s
|
||||
volumes:
|
||||
- grafana-storage:/var/lib/grafana
|
||||
- ./provisioning:/etc/grafana/provisioning
|
||||
@ -39,11 +46,13 @@ services:
|
||||
POSTGRES_DB: grafana
|
||||
POSTGRES_USER: grafana
|
||||
POSTGRES_PASSWORD: grafana
|
||||
POSTGRES_INITDB_ARGS: "--data-checksums"
|
||||
healthcheck:
|
||||
test: [ "CMD-SHELL", "pg_isready -U grafana -d grafana" ]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
timeout: 10s
|
||||
retries: 8
|
||||
start_period: 30s
|
||||
volumes:
|
||||
- postgres-storage:/var/lib/postgresql/data
|
||||
- ./postgres-init:/docker-entrypoint-initdb.d
|
||||
@ -53,6 +62,8 @@ services:
|
||||
prometheus:
|
||||
image: prom/prometheus:latest
|
||||
container_name: prometheus
|
||||
ports:
|
||||
- "9090:9090"
|
||||
volumes:
|
||||
- ./prometheus.yml:/etc/prometheus/prometheus.yml
|
||||
command:
|
||||
@ -60,6 +71,8 @@ services:
|
||||
- '--storage.tsdb.path=/prometheus'
|
||||
- '--web.console.libraries=/usr/share/prometheus/console_libraries'
|
||||
- '--web.console.templates=/usr/share/prometheus/consoles'
|
||||
- '--web.enable-lifecycle'
|
||||
|
||||
networks:
|
||||
- grafana-network
|
||||
|
||||
|
||||
@ -6,11 +6,13 @@ import pytest
|
||||
import pytest_docker.plugin
|
||||
import requests
|
||||
from freezegun import freeze_time
|
||||
from requests.adapters import HTTPAdapter
|
||||
from urllib3.util.retry import Retry
|
||||
|
||||
from datahub.ingestion.run.pipeline import Pipeline
|
||||
from datahub.testing import mce_helpers
|
||||
from tests.test_helpers import fs_helpers
|
||||
from tests.test_helpers.docker_helpers import cleanup_image
|
||||
from tests.test_helpers.docker_helpers import cleanup_image, wait_for_port
|
||||
|
||||
pytestmark = pytest.mark.integration_batch_2
|
||||
|
||||
@ -27,36 +29,78 @@ class GrafanaClient:
|
||||
"Authorization": f"Basic {b64encode(f'{admin_user}:{admin_password}'.encode()).decode()}",
|
||||
"Content-Type": "application/json",
|
||||
}
|
||||
self.session = requests.Session()
|
||||
retry_strategy = Retry(
|
||||
total=5,
|
||||
backoff_factor=2,
|
||||
status_forcelist=[500, 502, 503, 504, 429],
|
||||
allowed_methods=["GET", "POST"],
|
||||
)
|
||||
adapter = HTTPAdapter(max_retries=retry_strategy)
|
||||
self.session.mount("http://", adapter)
|
||||
self.session.mount("https://", adapter)
|
||||
|
||||
def create_service_account(self, name, role, max_retries=5):
    """Create a Grafana service account, retrying failed attempts.

    Posts ``{"name": name, "role": role, "isDisabled": False}`` to
    ``{self.url}/api/serviceaccounts`` through ``self.session`` using
    ``self.headers``.

    Args:
        name: Name of the service account to create.
        role: Role assigned to the new service account.
        max_retries: Maximum number of POST attempts. A value <= 0 sends
            no request at all.

    Returns:
        The decoded JSON body of the successful response, or ``None`` when
        every attempt failed.
    """
    service_account_payload = {"name": name, "role": role, "isDisabled": False}

    for attempt in range(max_retries):
        try:
            response = self.session.post(
                f"{self.url}/api/serviceaccounts",
                headers=self.headers,
                json=service_account_payload,
                timeout=15,
            )
            response.raise_for_status()
            service_account = response.json()
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException subclass.
        except (requests.exceptions.RequestException, ValueError) as e:
            if attempt < max_retries - 1:
                wait_time = 2**attempt  # Exponential backoff
                logging.warning(
                    f"Attempt {attempt + 1} failed to create service account: {e}. Retrying in {wait_time}s..."
                )
                time.sleep(wait_time)
            else:
                logging.error(
                    f"Failed to create service account after {max_retries} attempts: {e}"
                )
                return None
        else:
            logging.info(
                f"Successfully created service account '{name}' on attempt {attempt + 1}"
            )
            return service_account

    # Only reached when max_retries <= 0 and the loop never ran.
    return None
|
||||
|
||||
def create_api_key(self, service_account_id, key_name, role, max_retries=5):
    """Create an API token for a service account, retrying failed attempts.

    Posts ``{"name": key_name, "role": role}`` to
    ``{self.url}/api/serviceaccounts/{service_account_id}/tokens`` through
    ``self.session`` using ``self.headers``.

    Args:
        service_account_id: Identifier of the service account that owns the token.
        key_name: Name given to the new token.
        role: Role sent in the token payload.
        max_retries: Maximum number of POST attempts. A value <= 0 sends
            no request at all.

    Returns:
        The value of the ``"key"`` field of the response, or ``None`` when
        every attempt failed or the response carried no ``"key"`` field.
    """
    api_key_payload = {"name": key_name, "role": role}

    for attempt in range(max_retries):
        try:
            response = self.session.post(
                f"{self.url}/api/serviceaccounts/{service_account_id}/tokens",
                headers=self.headers,
                json=api_key_payload,
                timeout=15,
            )
            response.raise_for_status()
            api_key = response.json()
        # ValueError covers a non-JSON body on requests versions whose
        # JSON decode error is not a RequestException subclass.
        except (requests.exceptions.RequestException, ValueError) as e:
            if attempt < max_retries - 1:
                wait_time = 2**attempt  # Exponential backoff
                logging.warning(
                    f"Attempt {attempt + 1} failed to create API key: {e}. Retrying in {wait_time}s..."
                )
                time.sleep(wait_time)
            else:
                logging.error(
                    f"Failed to create API key after {max_retries} attempts: {e}"
                )
                return None
        else:
            try:
                key = api_key["key"]
            except (KeyError, TypeError):
                # The request itself succeeded, so repeating it would not
                # help; report the malformed body instead of crashing.
                logging.error(
                    f"API key response for '{key_name}' did not contain a 'key' field"
                )
                return None
            logging.info(
                f"Successfully created API key '{key_name}' on attempt {attempt + 1}"
            )
            return key

    # Only reached when max_retries <= 0 and the loop never ran.
    return None
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
@ -67,11 +111,14 @@ def test_resources_dir(pytestconfig):
|
||||
@pytest.fixture(scope="module")
|
||||
def test_api_key(loaded_grafana):
|
||||
# Get the actual mapped port from Docker services
|
||||
grafana_port = loaded_grafana.port_for("grafana", 3000)
|
||||
url = f"http://localhost:{grafana_port}"
|
||||
|
||||
url = "http://localhost:3000"
|
||||
admin_user = "admin"
|
||||
admin_password = "admin"
|
||||
|
||||
# Wait for Grafana to be fully ready before creating service account
|
||||
verify_grafana_fully_ready(loaded_grafana, timeout=180)
|
||||
|
||||
grafana_client = GrafanaClient(url, admin_user, admin_password)
|
||||
|
||||
service_account = grafana_client.create_service_account(
|
||||
@ -96,8 +143,30 @@ def loaded_grafana(docker_compose_runner, test_resources_dir):
|
||||
with docker_compose_runner(
|
||||
test_resources_dir / "docker-compose.yml", "grafana"
|
||||
) as docker_services:
|
||||
# Docker Compose now waits for health check to pass before considering service ready
|
||||
# Verify we can access the API endpoints as an additional safety check
|
||||
# Wait for all services to be ready
|
||||
wait_for_port(docker_services, "postgres", 5432, timeout=90)
|
||||
|
||||
# Prometheus container doesn't have bash, so use a simple HTTP check
|
||||
def check_prometheus_ready():
    """Return True when Prometheus' ``/-/ready`` endpoint answers HTTP 200.

    Looks up the host port for the ``prometheus`` service (container port
    9090) via ``docker_services`` from the enclosing scope and issues a GET
    with a 5 second timeout. Any failure is reported as "not ready".
    """
    try:
        prometheus_port = docker_services.port_for("prometheus", 9090)
        response = requests.get(
            f"http://localhost:{prometheus_port}/-/ready", timeout=5
        )
    except Exception:
        # Deliberately broad: this is a best-effort readiness probe, so both
        # port lookup errors and request errors just mean "not ready yet".
        # (Exception already covers requests.exceptions.RequestException.)
        return False
    return response.status_code == 200
|
||||
|
||||
wait_for_port(
|
||||
docker_services,
|
||||
"prometheus",
|
||||
9090,
|
||||
timeout=90,
|
||||
checker=check_prometheus_ready,
|
||||
)
|
||||
wait_for_port(docker_services, "grafana", 3000, timeout=180)
|
||||
|
||||
# Additional verification that Grafana API is fully accessible
|
||||
verify_grafana_api_ready(docker_services)
|
||||
yield docker_services
|
||||
|
||||
@ -106,24 +175,29 @@ def loaded_grafana(docker_compose_runner, test_resources_dir):
|
||||
|
||||
def verify_grafana_api_ready(docker_services: pytest_docker.plugin.Services) -> None:
|
||||
"""Robust verification that Grafana API is fully accessible after health check passes"""
|
||||
import requests
|
||||
|
||||
grafana_port = docker_services.port_for("grafana", 3000)
|
||||
base_url = f"http://localhost:{grafana_port}"
|
||||
base_url = "http://localhost:3000"
|
||||
|
||||
# Configure requests session with retries
|
||||
session = requests.Session()
|
||||
retry_strategy = Retry(
|
||||
total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504]
|
||||
)
|
||||
adapter = HTTPAdapter(max_retries=retry_strategy)
|
||||
session.mount("http://", adapter)
|
||||
|
||||
# Wait for API endpoints to be fully ready (health check might pass but API still initializing)
|
||||
max_attempts = 30
|
||||
max_attempts = 60
|
||||
for attempt in range(max_attempts):
|
||||
try:
|
||||
# Test both basic API access and service account creation capability
|
||||
api_url = f"{base_url}/api/search"
|
||||
resp = requests.get(api_url, auth=("admin", "admin"), timeout=10)
|
||||
resp = session.get(api_url, auth=("admin", "admin"), timeout=15)
|
||||
|
||||
if resp.status_code == 200:
|
||||
# Also verify service account API is ready (needed for test_api_key fixture)
|
||||
# Service accounts might not be available in all Grafana versions
|
||||
sa_url = f"{base_url}/api/serviceaccounts"
|
||||
sa_resp = requests.get(sa_url, auth=("admin", "admin"), timeout=10)
|
||||
sa_resp = session.get(sa_url, auth=("admin", "admin"), timeout=15)
|
||||
|
||||
if sa_resp.status_code == 200:
|
||||
logging.info(
|
||||
@ -147,12 +221,58 @@ def verify_grafana_api_ready(docker_services: pytest_docker.plugin.Services) ->
|
||||
logging.debug(f"API readiness check failed (attempt {attempt + 1}): {e}")
|
||||
|
||||
if attempt < max_attempts - 1:
|
||||
time.sleep(2)
|
||||
time.sleep(3)
|
||||
|
||||
logging.warning(f"Grafana API may not be fully ready after {max_attempts} attempts")
|
||||
# Don't fail here - let the test proceed and provide better error info if needed
|
||||
|
||||
|
||||
def verify_grafana_fully_ready(
    docker_services: pytest_docker.plugin.Services, timeout: int = 120
) -> None:
    """Poll Grafana until it looks ready for service account operations.

    Repeatedly requests ``/api/health``, ``/api/org`` and
    ``/api/serviceaccounts`` on ``http://localhost:3000`` with the
    ``admin``/``admin`` credentials, pausing 2 seconds between rounds.

    Args:
        docker_services: Accepted for call-site symmetry; not used by this
            function, which always talks to the fixed localhost:3000 URL.
        timeout: Maximum number of seconds to keep polling.

    Returns:
        None. On timeout a warning is logged and the function returns
        normally rather than raising.
    """
    base_url = "http://localhost:3000"

    # Each endpoint is paired with the status codes that count as "ready".
    # 404 is tolerated only for service accounts (older Grafana versions may
    # not expose that API); a 404 from health/org means Grafana is not up.
    endpoints_to_check = [
        (f"{base_url}/api/health", (200,)),
        (f"{base_url}/api/org", (200,)),
        (f"{base_url}/api/serviceaccounts", (200, 404)),
    ]

    retry_strategy = Retry(
        total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504]
    )

    end_time = time.time() + timeout

    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        session.mount("http://", HTTPAdapter(max_retries=retry_strategy))

        while time.time() < end_time:
            try:
                # all() short-circuits on the first endpoint that is not ready.
                all_ready = all(
                    session.get(
                        endpoint, auth=("admin", "admin"), timeout=10
                    ).status_code
                    in accepted
                    for endpoint, accepted in endpoints_to_check
                )
            except Exception as e:
                # Best-effort probe: any failure just means "try again".
                logging.debug(f"Grafana readiness check failed: {e}")
            else:
                if all_ready:
                    logging.info("Grafana is fully ready for operations")
                    return

            time.sleep(2)

    logging.warning(f"Grafana may not be fully ready after {timeout}s timeout")
|
||||
|
||||
|
||||
@freeze_time(FROZEN_TIME)
|
||||
def test_grafana_basic_ingest(
|
||||
loaded_grafana, pytestconfig, tmp_path, test_resources_dir, test_api_key
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user