mirror of
https://github.com/datahub-project/datahub.git
synced 2025-11-11 17:03:33 +00:00
fix(ingestion/grafana): stabilize integration tests (#14547)
This commit is contained in:
parent
0add552473
commit
88842566cf
@ -14,12 +14,19 @@ services:
|
|||||||
- GF_AUTH_ANONYMOUS_ENABLED=true
|
- GF_AUTH_ANONYMOUS_ENABLED=true
|
||||||
- GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer
|
- GF_AUTH_ANONYMOUS_ORG_ROLE=Viewer
|
||||||
- GF_FEATURE_TOGGLES_ENABLE=publicDashboards
|
- GF_FEATURE_TOGGLES_ENABLE=publicDashboards
|
||||||
|
# Improve startup stability in CI
|
||||||
|
- GF_LOG_LEVEL=info
|
||||||
|
- GF_SERVER_HTTP_PORT=3000
|
||||||
|
- GF_DATABASE_WAL=true
|
||||||
|
# Reduce resource usage for CI
|
||||||
|
- GF_ANALYTICS_REPORTING_ENABLED=false
|
||||||
|
- GF_ANALYTICS_CHECK_FOR_UPDATES=false
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD-SHELL", "wget --no-verbose --tries=1 --spider http://localhost:3000/api/health || exit 1"]
|
test: ["CMD-SHELL", "wget --no-verbose --tries=3 --timeout=10 --spider http://localhost:3000/api/health && wget --no-verbose --tries=1 --timeout=5 --spider http://localhost:3000/api/org || exit 1"]
|
||||||
interval: 10s
|
interval: 15s
|
||||||
timeout: 5s
|
timeout: 10s
|
||||||
retries: 6
|
retries: 8
|
||||||
start_period: 30s
|
start_period: 45s
|
||||||
volumes:
|
volumes:
|
||||||
- grafana-storage:/var/lib/grafana
|
- grafana-storage:/var/lib/grafana
|
||||||
- ./provisioning:/etc/grafana/provisioning
|
- ./provisioning:/etc/grafana/provisioning
|
||||||
@ -39,11 +46,13 @@ services:
|
|||||||
POSTGRES_DB: grafana
|
POSTGRES_DB: grafana
|
||||||
POSTGRES_USER: grafana
|
POSTGRES_USER: grafana
|
||||||
POSTGRES_PASSWORD: grafana
|
POSTGRES_PASSWORD: grafana
|
||||||
|
POSTGRES_INITDB_ARGS: "--data-checksums"
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: [ "CMD-SHELL", "pg_isready -U grafana -d grafana" ]
|
test: [ "CMD-SHELL", "pg_isready -U grafana -d grafana" ]
|
||||||
interval: 10s
|
interval: 10s
|
||||||
timeout: 5s
|
timeout: 10s
|
||||||
retries: 5
|
retries: 8
|
||||||
|
start_period: 30s
|
||||||
volumes:
|
volumes:
|
||||||
- postgres-storage:/var/lib/postgresql/data
|
- postgres-storage:/var/lib/postgresql/data
|
||||||
- ./postgres-init:/docker-entrypoint-initdb.d
|
- ./postgres-init:/docker-entrypoint-initdb.d
|
||||||
@ -53,6 +62,8 @@ services:
|
|||||||
prometheus:
|
prometheus:
|
||||||
image: prom/prometheus:latest
|
image: prom/prometheus:latest
|
||||||
container_name: prometheus
|
container_name: prometheus
|
||||||
|
ports:
|
||||||
|
- "9090:9090"
|
||||||
volumes:
|
volumes:
|
||||||
- ./prometheus.yml:/etc/prometheus/prometheus.yml
|
- ./prometheus.yml:/etc/prometheus/prometheus.yml
|
||||||
command:
|
command:
|
||||||
@ -60,6 +71,8 @@ services:
|
|||||||
- '--storage.tsdb.path=/prometheus'
|
- '--storage.tsdb.path=/prometheus'
|
||||||
- '--web.console.libraries=/usr/share/prometheus/console_libraries'
|
- '--web.console.libraries=/usr/share/prometheus/console_libraries'
|
||||||
- '--web.console.templates=/usr/share/prometheus/consoles'
|
- '--web.console.templates=/usr/share/prometheus/consoles'
|
||||||
|
- '--web.enable-lifecycle'
|
||||||
|
|
||||||
networks:
|
networks:
|
||||||
- grafana-network
|
- grafana-network
|
||||||
|
|
||||||
|
|||||||
@ -6,11 +6,13 @@ import pytest
|
|||||||
import pytest_docker.plugin
|
import pytest_docker.plugin
|
||||||
import requests
|
import requests
|
||||||
from freezegun import freeze_time
|
from freezegun import freeze_time
|
||||||
|
from requests.adapters import HTTPAdapter
|
||||||
|
from urllib3.util.retry import Retry
|
||||||
|
|
||||||
from datahub.ingestion.run.pipeline import Pipeline
|
from datahub.ingestion.run.pipeline import Pipeline
|
||||||
from datahub.testing import mce_helpers
|
from datahub.testing import mce_helpers
|
||||||
from tests.test_helpers import fs_helpers
|
from tests.test_helpers import fs_helpers
|
||||||
from tests.test_helpers.docker_helpers import cleanup_image
|
from tests.test_helpers.docker_helpers import cleanup_image, wait_for_port
|
||||||
|
|
||||||
pytestmark = pytest.mark.integration_batch_2
|
pytestmark = pytest.mark.integration_batch_2
|
||||||
|
|
||||||
@ -27,35 +29,77 @@ class GrafanaClient:
|
|||||||
"Authorization": f"Basic {b64encode(f'{admin_user}:{admin_password}'.encode()).decode()}",
|
"Authorization": f"Basic {b64encode(f'{admin_user}:{admin_password}'.encode()).decode()}",
|
||||||
"Content-Type": "application/json",
|
"Content-Type": "application/json",
|
||||||
}
|
}
|
||||||
|
self.session = requests.Session()
|
||||||
|
retry_strategy = Retry(
|
||||||
|
total=5,
|
||||||
|
backoff_factor=2,
|
||||||
|
status_forcelist=[500, 502, 503, 504, 429],
|
||||||
|
allowed_methods=["GET", "POST"],
|
||||||
|
)
|
||||||
|
adapter = HTTPAdapter(max_retries=retry_strategy)
|
||||||
|
self.session.mount("http://", adapter)
|
||||||
|
self.session.mount("https://", adapter)
|
||||||
|
|
||||||
def create_service_account(self, name, role):
|
def create_service_account(self, name, role, max_retries=5):
|
||||||
service_account_payload = {"name": name, "role": role, "isDisabled": False}
|
service_account_payload = {"name": name, "role": role, "isDisabled": False}
|
||||||
|
|
||||||
|
for attempt in range(max_retries):
|
||||||
try:
|
try:
|
||||||
response = requests.post(
|
response = self.session.post(
|
||||||
f"{self.url}/api/serviceaccounts",
|
f"{self.url}/api/serviceaccounts",
|
||||||
headers=self.headers,
|
headers=self.headers,
|
||||||
json=service_account_payload,
|
json=service_account_payload,
|
||||||
|
timeout=15,
|
||||||
)
|
)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
service_account = response.json()
|
service_account = response.json()
|
||||||
|
logging.info(
|
||||||
|
f"Successfully created service account '{name}' on attempt {attempt + 1}"
|
||||||
|
)
|
||||||
return service_account
|
return service_account
|
||||||
except requests.exceptions.RequestException as e:
|
except requests.exceptions.RequestException as e:
|
||||||
logging.error(f"Error creating service account: {e}")
|
if attempt < max_retries - 1:
|
||||||
|
wait_time = 2**attempt # Exponential backoff
|
||||||
|
logging.warning(
|
||||||
|
f"Attempt {attempt + 1} failed to create service account: {e}. Retrying in {wait_time}s..."
|
||||||
|
)
|
||||||
|
time.sleep(wait_time)
|
||||||
|
else:
|
||||||
|
logging.error(
|
||||||
|
f"Failed to create service account after {max_retries} attempts: {e}"
|
||||||
|
)
|
||||||
|
return None
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def create_api_key(self, service_account_id, key_name, role):
|
def create_api_key(self, service_account_id, key_name, role, max_retries=5):
|
||||||
api_key_payload = {"name": key_name, "role": role}
|
api_key_payload = {"name": key_name, "role": role}
|
||||||
|
|
||||||
|
for attempt in range(max_retries):
|
||||||
try:
|
try:
|
||||||
response = requests.post(
|
response = self.session.post(
|
||||||
f"{self.url}/api/serviceaccounts/{service_account_id}/tokens",
|
f"{self.url}/api/serviceaccounts/{service_account_id}/tokens",
|
||||||
headers=self.headers,
|
headers=self.headers,
|
||||||
json=api_key_payload,
|
json=api_key_payload,
|
||||||
|
timeout=15,
|
||||||
)
|
)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
api_key = response.json()
|
api_key = response.json()
|
||||||
|
logging.info(
|
||||||
|
f"Successfully created API key '{key_name}' on attempt {attempt + 1}"
|
||||||
|
)
|
||||||
return api_key["key"]
|
return api_key["key"]
|
||||||
except requests.exceptions.RequestException as e:
|
except requests.exceptions.RequestException as e:
|
||||||
logging.error(f"Error creating API key: {e}")
|
if attempt < max_retries - 1:
|
||||||
|
wait_time = 2**attempt # Exponential backoff
|
||||||
|
logging.warning(
|
||||||
|
f"Attempt {attempt + 1} failed to create API key: {e}. Retrying in {wait_time}s..."
|
||||||
|
)
|
||||||
|
time.sleep(wait_time)
|
||||||
|
else:
|
||||||
|
logging.error(
|
||||||
|
f"Failed to create API key after {max_retries} attempts: {e}"
|
||||||
|
)
|
||||||
|
return None
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
@ -67,11 +111,14 @@ def test_resources_dir(pytestconfig):
|
|||||||
@pytest.fixture(scope="module")
|
@pytest.fixture(scope="module")
|
||||||
def test_api_key(loaded_grafana):
|
def test_api_key(loaded_grafana):
|
||||||
# Get the actual mapped port from Docker services
|
# Get the actual mapped port from Docker services
|
||||||
grafana_port = loaded_grafana.port_for("grafana", 3000)
|
|
||||||
url = f"http://localhost:{grafana_port}"
|
url = "http://localhost:3000"
|
||||||
admin_user = "admin"
|
admin_user = "admin"
|
||||||
admin_password = "admin"
|
admin_password = "admin"
|
||||||
|
|
||||||
|
# Wait for Grafana to be fully ready before creating service account
|
||||||
|
verify_grafana_fully_ready(loaded_grafana, timeout=180)
|
||||||
|
|
||||||
grafana_client = GrafanaClient(url, admin_user, admin_password)
|
grafana_client = GrafanaClient(url, admin_user, admin_password)
|
||||||
|
|
||||||
service_account = grafana_client.create_service_account(
|
service_account = grafana_client.create_service_account(
|
||||||
@ -96,8 +143,30 @@ def loaded_grafana(docker_compose_runner, test_resources_dir):
|
|||||||
with docker_compose_runner(
|
with docker_compose_runner(
|
||||||
test_resources_dir / "docker-compose.yml", "grafana"
|
test_resources_dir / "docker-compose.yml", "grafana"
|
||||||
) as docker_services:
|
) as docker_services:
|
||||||
# Docker Compose now waits for health check to pass before considering service ready
|
# Wait for all services to be ready
|
||||||
# Verify we can access the API endpoints as an additional safety check
|
wait_for_port(docker_services, "postgres", 5432, timeout=90)
|
||||||
|
|
||||||
|
# Prometheus container doesn't have bash, so use a simple HTTP check
|
||||||
|
def check_prometheus_ready():
|
||||||
|
try:
|
||||||
|
prometheus_port = docker_services.port_for("prometheus", 9090)
|
||||||
|
response = requests.get(
|
||||||
|
f"http://localhost:{prometheus_port}/-/ready", timeout=5
|
||||||
|
)
|
||||||
|
return response.status_code == 200
|
||||||
|
except (requests.exceptions.RequestException, Exception):
|
||||||
|
return False
|
||||||
|
|
||||||
|
wait_for_port(
|
||||||
|
docker_services,
|
||||||
|
"prometheus",
|
||||||
|
9090,
|
||||||
|
timeout=90,
|
||||||
|
checker=check_prometheus_ready,
|
||||||
|
)
|
||||||
|
wait_for_port(docker_services, "grafana", 3000, timeout=180)
|
||||||
|
|
||||||
|
# Additional verification that Grafana API is fully accessible
|
||||||
verify_grafana_api_ready(docker_services)
|
verify_grafana_api_ready(docker_services)
|
||||||
yield docker_services
|
yield docker_services
|
||||||
|
|
||||||
@ -106,24 +175,29 @@ def loaded_grafana(docker_compose_runner, test_resources_dir):
|
|||||||
|
|
||||||
def verify_grafana_api_ready(docker_services: pytest_docker.plugin.Services) -> None:
|
def verify_grafana_api_ready(docker_services: pytest_docker.plugin.Services) -> None:
|
||||||
"""Robust verification that Grafana API is fully accessible after health check passes"""
|
"""Robust verification that Grafana API is fully accessible after health check passes"""
|
||||||
import requests
|
|
||||||
|
|
||||||
grafana_port = docker_services.port_for("grafana", 3000)
|
base_url = "http://localhost:3000"
|
||||||
base_url = f"http://localhost:{grafana_port}"
|
|
||||||
|
# Configure requests session with retries
|
||||||
|
session = requests.Session()
|
||||||
|
retry_strategy = Retry(
|
||||||
|
total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504]
|
||||||
|
)
|
||||||
|
adapter = HTTPAdapter(max_retries=retry_strategy)
|
||||||
|
session.mount("http://", adapter)
|
||||||
|
|
||||||
# Wait for API endpoints to be fully ready (health check might pass but API still initializing)
|
# Wait for API endpoints to be fully ready (health check might pass but API still initializing)
|
||||||
max_attempts = 30
|
max_attempts = 60
|
||||||
for attempt in range(max_attempts):
|
for attempt in range(max_attempts):
|
||||||
try:
|
try:
|
||||||
# Test both basic API access and service account creation capability
|
# Test both basic API access and service account creation capability
|
||||||
api_url = f"{base_url}/api/search"
|
api_url = f"{base_url}/api/search"
|
||||||
resp = requests.get(api_url, auth=("admin", "admin"), timeout=10)
|
resp = session.get(api_url, auth=("admin", "admin"), timeout=15)
|
||||||
|
|
||||||
if resp.status_code == 200:
|
if resp.status_code == 200:
|
||||||
# Also verify service account API is ready (needed for test_api_key fixture)
|
# Also verify service account API is ready (needed for test_api_key fixture)
|
||||||
# Service accounts might not be available in all Grafana versions
|
|
||||||
sa_url = f"{base_url}/api/serviceaccounts"
|
sa_url = f"{base_url}/api/serviceaccounts"
|
||||||
sa_resp = requests.get(sa_url, auth=("admin", "admin"), timeout=10)
|
sa_resp = session.get(sa_url, auth=("admin", "admin"), timeout=15)
|
||||||
|
|
||||||
if sa_resp.status_code == 200:
|
if sa_resp.status_code == 200:
|
||||||
logging.info(
|
logging.info(
|
||||||
@ -147,12 +221,58 @@ def verify_grafana_api_ready(docker_services: pytest_docker.plugin.Services) ->
|
|||||||
logging.debug(f"API readiness check failed (attempt {attempt + 1}): {e}")
|
logging.debug(f"API readiness check failed (attempt {attempt + 1}): {e}")
|
||||||
|
|
||||||
if attempt < max_attempts - 1:
|
if attempt < max_attempts - 1:
|
||||||
time.sleep(2)
|
time.sleep(3)
|
||||||
|
|
||||||
logging.warning(f"Grafana API may not be fully ready after {max_attempts} attempts")
|
logging.warning(f"Grafana API may not be fully ready after {max_attempts} attempts")
|
||||||
# Don't fail here - let the test proceed and provide better error info if needed
|
# Don't fail here - let the test proceed and provide better error info if needed
|
||||||
|
|
||||||
|
|
||||||
|
def verify_grafana_fully_ready(
|
||||||
|
docker_services: pytest_docker.plugin.Services, timeout: int = 120
|
||||||
|
) -> None:
|
||||||
|
"""Extended verification that Grafana is fully ready for service account operations"""
|
||||||
|
base_url = "http://localhost:3000"
|
||||||
|
|
||||||
|
session = requests.Session()
|
||||||
|
retry_strategy = Retry(
|
||||||
|
total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504]
|
||||||
|
)
|
||||||
|
adapter = HTTPAdapter(max_retries=retry_strategy)
|
||||||
|
session.mount("http://", adapter)
|
||||||
|
|
||||||
|
end_time = time.time() + timeout
|
||||||
|
|
||||||
|
while time.time() < end_time:
|
||||||
|
try:
|
||||||
|
# Test multiple endpoints to ensure full readiness
|
||||||
|
endpoints_to_check = [
|
||||||
|
f"{base_url}/api/health",
|
||||||
|
f"{base_url}/api/org",
|
||||||
|
f"{base_url}/api/serviceaccounts",
|
||||||
|
]
|
||||||
|
|
||||||
|
all_ready = True
|
||||||
|
for endpoint in endpoints_to_check:
|
||||||
|
resp = session.get(endpoint, auth=("admin", "admin"), timeout=10)
|
||||||
|
if resp.status_code not in [
|
||||||
|
200,
|
||||||
|
404,
|
||||||
|
]: # 404 is OK for service accounts in older versions
|
||||||
|
all_ready = False
|
||||||
|
break
|
||||||
|
|
||||||
|
if all_ready:
|
||||||
|
logging.info("Grafana is fully ready for operations")
|
||||||
|
return
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logging.debug(f"Grafana readiness check failed: {e}")
|
||||||
|
|
||||||
|
time.sleep(2)
|
||||||
|
|
||||||
|
logging.warning(f"Grafana may not be fully ready after {timeout}s timeout")
|
||||||
|
|
||||||
|
|
||||||
@freeze_time(FROZEN_TIME)
|
@freeze_time(FROZEN_TIME)
|
||||||
def test_grafana_basic_ingest(
|
def test_grafana_basic_ingest(
|
||||||
loaded_grafana, pytestconfig, tmp_path, test_resources_dir, test_api_key
|
loaded_grafana, pytestconfig, tmp_path, test_resources_dir, test_api_key
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user