# datahub/smoke-test/tests/cli/dataset_cmd/test_dataset_command.py
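"""Smoke tests for the `datahub dataset sync` CLI command.

Covers syncing a dataset YAML file to DataHub (--to-datahub), pulling
changes back from DataHub into the YAML file (--from-datahub), a
bidirectional round trip with edits on both sides, and validation failure
for references to non-existent structured properties.
"""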

import logging
import os
import tempfile
from pathlib import Path
from random import randint
from typing import Optional

import pytest
import yaml
from click.testing import CliRunner

from datahub.api.entities.dataset.dataset import Dataset
from datahub.emitter.mce_builder import make_dataset_urn
from datahub.entrypoints import datahub
from datahub.ingestion.graph.client import DataHubGraph

from tests.consistency_utils import wait_for_writes_to_sync
from tests.utils import delete_urns, get_sleep_info
logger = logging.getLogger(__name__)

# Generate a random dataset ID to avoid interference between test runs
start_index = randint(10, 10000)
dataset_id = f"test_dataset_sync_{start_index}"
dataset_urn = make_dataset_urn("snowflake", dataset_id)

sleep_sec, sleep_times = get_sleep_info()

runner = CliRunner()
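# Note: CliRunner executes the `datahub` entrypoint in-process; the GMS URL
# and token are passed per invocation via the `env` argument of
# `runner.invoke` (see run_cli_command below).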


@pytest.fixture(scope="module")
def setup_teardown_dataset(graph_client: DataHubGraph):
    """Fixture to set up and tear down the test dataset"""
    try:
        # Setup: delete any pre-existing dataset to ensure a clean state
        if graph_client.exists(dataset_urn):
            delete_urns(graph_client, [dataset_urn])
            wait_for_writes_to_sync()
        yield
    finally:
        # Teardown: clean up the dataset created by the tests
        if graph_client.exists(dataset_urn):
            delete_urns(graph_client, [dataset_urn])
            wait_for_writes_to_sync()
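

# The fixture above is module-scoped, so every test in this file shares the
# same randomly generated dataset URN; each test creates the dataset state
# it needs and relies on the fixture for final cleanup.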


def create_dataset_yaml(
    file_path: Path, additional_properties: Optional[dict] = None
) -> None:
    """Create a test dataset YAML file"""
    dataset_yaml = {
        "id": dataset_id,
        "platform": "snowflake",
        "env": "PROD",
        "description": "Test dataset for CLI sync smoke test",
        "schema": {
            "fields": [
                {"id": "id", "type": "number", "description": "Primary key"},
                {"id": "name", "type": "string", "description": "User name"},
            ]
        },
        "tags": ["test", "smoke_test"],
        "properties": {"origin": "cli_test"},
    }

    # Merge in any additional properties
    if additional_properties:
        dataset_yaml.update(additional_properties)

    # Write the YAML file, starting with a comment to test comment preservation
    with open(file_path, "w") as f:
        f.write("# Dataset sync test\n")
        yaml.dump(dataset_yaml, f, indent=2)
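

# The leading "# Dataset sync test" comment written above is asserted on
# later: `dataset sync --from-datahub` is expected to update the YAML file
# in place without discarding comments.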


def run_cli_command(cmd, auth_session):
    """Run a DataHub CLI command using CliRunner and auth_session"""
    args = cmd.split()
    result = runner.invoke(
        datahub,
        args,
        env={
            "DATAHUB_GMS_URL": auth_session.gms_url(),
            "DATAHUB_GMS_TOKEN": auth_session.gms_token(),
        },
    )
    if result.exit_code != 0:
        logger.error(f"Command failed: {cmd}")
        # Use result.output rather than result.stderr: with the default
        # CliRunner, stderr is not separately captured and accessing it can
        # raise ValueError on older click versions.
        logger.error(f"Output: {result.output}")
        if result.exception:
            logger.error("Exception raised by command", exc_info=result.exception)
        raise Exception(f"Command failed with exit code {result.exit_code}")
    return result
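

# Equivalent shell invocation, for reference (URL and token values are
# placeholders):
#   DATAHUB_GMS_URL=<url> DATAHUB_GMS_TOKEN=<token> \
#       datahub dataset sync -f <file>.yml --to-datahub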


def test_dataset_sync_to_datahub(
    setup_teardown_dataset, graph_client: DataHubGraph, auth_session
):
    """Test syncing a dataset from YAML to DataHub"""
    # delete=False keeps the file on disk after the handle closes; the
    # finally block below removes it.
    with tempfile.NamedTemporaryFile(suffix=".yml", delete=False) as tmp:
        temp_file_path = Path(tmp.name)

    try:
        # Create a dataset YAML file
        create_dataset_yaml(temp_file_path)

        # Run the CLI command to sync to DataHub
        cmd = f"dataset sync -f {temp_file_path} --to-datahub"
        result = run_cli_command(cmd, auth_session)

        # Verify the success message in the output
        assert f"Update succeeded for urn {dataset_urn}" in result.stdout

        # Wait for changes to propagate
        wait_for_writes_to_sync()

        # Verify the dataset exists in DataHub
        assert graph_client.exists(dataset_urn)

        # Retrieve the dataset and verify its properties
        dataset = Dataset.from_datahub(graph=graph_client, urn=dataset_urn)
        assert dataset.id == dataset_id
        assert dataset.platform == "snowflake"
        assert dataset.description == "Test dataset for CLI sync smoke test"
        assert dataset.tags == ["test", "smoke_test"]
        assert dataset.properties is not None
        assert dataset.properties["origin"] == "cli_test"
    finally:
        # Clean up the temporary file
        if temp_file_path.exists():
            os.unlink(temp_file_path)


def test_dataset_sync_from_datahub(
    setup_teardown_dataset, graph_client: DataHubGraph, auth_session
):
    """Test syncing a dataset from DataHub to YAML"""
    with tempfile.NamedTemporaryFile(suffix=".yml", delete=False) as tmp:
        temp_file_path = Path(tmp.name)

    try:
        # First, create a dataset in DataHub
        dataset = Dataset(
            id=dataset_id,
            platform="snowflake",
            description="Test dataset created directly in DataHub",
            tags=["from_datahub", "cli_test"],
            properties={"origin": "direct_creation"},
            schema=None,
        )

        # Emit the dataset to DataHub
        for mcp in dataset.generate_mcp():
            graph_client.emit(mcp)
        wait_for_writes_to_sync()

        # Create a minimal dataset YAML as a starting point
        create_dataset_yaml(
            temp_file_path, {"description": "This will be overwritten"}
        )

        # Run the CLI command to sync from DataHub
        cmd = f"dataset sync -f {temp_file_path} --from-datahub"
        result = run_cli_command(cmd, auth_session)
        assert result.exit_code == 0

        # Wait to ensure the file is updated
        wait_for_writes_to_sync()

        # Verify the YAML file was updated
        with open(temp_file_path, "r") as f:
            content = f.read()

        # Check that the comment was preserved
        assert "# Dataset sync test" in content

        # Check for content pulled from DataHub
        assert "Test dataset created directly in DataHub" in content
        assert "from_datahub" in content
        assert "direct_creation" in content
    finally:
        # Clean up the temporary file
        if temp_file_path.exists():
            os.unlink(temp_file_path)


def test_dataset_sync_bidirectional(
    setup_teardown_dataset, graph_client: DataHubGraph, auth_session
):
    """Test bidirectional sync with modifications on both sides"""
    with tempfile.NamedTemporaryFile(suffix=".yml", delete=False) as tmp:
        temp_file_path = Path(tmp.name)

    try:
        # 1. Create the initial dataset YAML
        create_dataset_yaml(temp_file_path)

        # 2. Sync to DataHub
        run_cli_command(
            f"dataset sync -f {temp_file_path} --to-datahub", auth_session
        )
        wait_for_writes_to_sync()

        # 3. Modify the dataset directly in DataHub
        dataset_patcher = Dataset.get_patch_builder(dataset_urn)
        dataset_patcher.set_description("Modified directly in DataHub")
        for mcp in dataset_patcher.build():
            graph_client.emit(mcp)
        wait_for_writes_to_sync()

        # 4. Sync from DataHub to update the YAML
        run_cli_command(
            f"dataset sync -f {temp_file_path} --from-datahub", auth_session
        )

        # 5. Modify the YAML file directly (yaml is imported at module level;
        # write YAML back out, not JSON, so the file stays a YAML document)
        with open(temp_file_path, "r") as f:
            data = yaml.safe_load(f)
        data["properties"]["modified_by"] = "cli_test"
        data["tags"].append("modified_yaml")
        with open(temp_file_path, "w") as f:
            f.write("# Modified comment\n")
            yaml.dump(data, f, indent=2)

        # 6. Sync back to DataHub
        run_cli_command(
            f"dataset sync -f {temp_file_path} --to-datahub", auth_session
        )
        wait_for_writes_to_sync()

        # 7. Verify both modifications are present in DataHub
        final_dataset = Dataset.from_datahub(graph=graph_client, urn=dataset_urn)
        assert final_dataset.description == "Modified directly in DataHub"
        assert final_dataset.properties is not None
        assert final_dataset.properties["modified_by"] == "cli_test"
        assert final_dataset.tags is not None
        assert "modified_yaml" in final_dataset.tags

        # 8. Sync once more from DataHub and verify the YAML comment is intact
        run_cli_command(
            f"dataset sync -f {temp_file_path} --from-datahub", auth_session
        )
        with open(temp_file_path, "r") as f:
            content = f.read()
        assert "# Modified comment" in content
    finally:
        # Clean up the temporary file
        if temp_file_path.exists():
            os.unlink(temp_file_path)


def test_dataset_sync_validation(
    setup_teardown_dataset, graph_client: DataHubGraph, auth_session
):
    """Test validation during sync to DataHub with invalid references"""
    with tempfile.NamedTemporaryFile(suffix=".yml", delete=False) as tmp:
        temp_file_path = Path(tmp.name)

    try:
        # Create a dataset with invalid structured property references
        create_dataset_yaml(
            temp_file_path,
            {
                "structured_properties": {"non_existent_property": "value"},
                "schema": {
                    "fields": [
                        {
                            "id": "field1",
                            "type": "string",
                            "structured_properties": {
                                "non_existent_field_property": "value"
                            },
                        }
                    ]
                },
            },
        )

        # Attempting to sync to DataHub should fail validation;
        # run_cli_command raises on a non-zero exit code.
        cmd = f"dataset sync -f {temp_file_path} --to-datahub"
        with pytest.raises(Exception, match="Command failed"):
            run_cli_command(cmd, auth_session)
    finally:
        # Clean up the temporary file
        if temp_file_path.exists():
            os.unlink(temp_file_path)