mirror of
https://github.com/datahub-project/datahub.git
synced 2025-09-16 04:33:12 +00:00
195 lines
7.7 KiB
Python
195 lines
7.7 KiB
Python
"""
|
|
Unit tests for fivetran_constants.py
|
|
|
|
Tests focus on business logic and edge cases that matter for connector functionality.
|
|
"""
|
|
|
|
import pytest
|
|
|
|
from datahub.ingestion.source.fivetran.fivetran_constants import (
|
|
DEFAULT_MAX_TABLE_LINEAGE_PER_CONNECTOR,
|
|
MAX_COLUMN_LINEAGE_PER_CONNECTOR,
|
|
MAX_JOBS_PER_CONNECTOR,
|
|
DataJobMode,
|
|
FivetranMode,
|
|
get_platform_from_fivetran_service,
|
|
)
|
|
|
|
|
|
class TestPlatformMapping:
    """Tests for get_platform_from_fivetran_service.

    Each test feeds a Fivetran service name to the function and checks the
    platform name it returns. Table-driven tests are parametrized so every
    case is reported independently instead of stopping at the first failing
    entry of a loop.
    """

    @pytest.mark.parametrize(
        "fivetran_service,expected_datahub_platform",
        [
            # Critical pattern matching for SQL Server variants
            ("sql_server", "mssql"),
            ("sql_server_rds", "mssql"),
            ("azure_sql", "mssql"),
            ("azure_sql_database", "mssql"),
            # PostgreSQL variants that must map correctly
            ("postgres", "postgres"),
            ("postgresql", "postgres"),
            ("postgres_rds", "postgres"),
            ("amazon_rds_for_postgresql", "postgres"),
            # MySQL variants
            ("mysql", "mysql"),
            ("mysql_rds", "mysql"),
            ("google_cloud_mysql", "mysql"),
            ("amazon_rds_for_mysql", "mysql"),
            # Cloud storage platforms
            ("s3", "s3"),
            ("amazon_s3", "s3"),
            ("google_cloud_storage", "gcs"),
            ("gcs", "gcs"),
        ],
    )
    def test_critical_platform_mappings(
        self, fivetran_service: str, expected_datahub_platform: str
    ) -> None:
        """Check that a known service name maps to the expected platform."""
        result = get_platform_from_fivetran_service(fivetran_service)
        assert result == expected_datahub_platform, (
            f"Platform mapping failed for {fivetran_service}: "
            f"expected {expected_datahub_platform}, got {result}"
        )

    @pytest.mark.parametrize(
        "service_name,expected_platform",
        [
            ("POSTGRES", "postgres"),
            ("PostgreSQL", "postgres"),
            ("MYSQL", "mysql"),
            ("MySQL", "mysql"),
            ("SQL_SERVER", "mssql"),
            ("Sql_Server", "mssql"),
        ],
    )
    def test_case_insensitive_platform_detection(
        self, service_name: str, expected_platform: str
    ) -> None:
        """Check that upper- and mixed-case service names map like lowercase ones."""
        result = get_platform_from_fivetran_service(service_name)
        assert result == expected_platform, (
            f"Case-insensitive detection failed for {service_name}"
        )

    @pytest.mark.parametrize(
        "service",
        [
            "custom_proprietary_db",
            "new_saas_platform_2024",
            "unknown_connector_type",
            "test_service_123",
        ],
    )
    def test_unknown_service_fallback_behavior(self, service: str) -> None:
        """Check that an unrecognized service name is returned unchanged."""
        result = get_platform_from_fivetran_service(service)
        # Should return the service name itself for unknown services
        assert result == service, (
            f"Unknown service {service} should return itself, got {result}"
        )

    def test_empty_and_none_service_handling(self) -> None:
        """Check the empty-string and whitespace-only edge cases.

        Note: despite the test name, ``None`` is not exercised here; only
        string inputs are passed to the function.
        """
        # Empty string should return "unknown"
        assert get_platform_from_fivetran_service("") == "unknown"

        # Whitespace-only input: the exact return value is deliberately not
        # pinned. Only verify that the call does not raise and yields a string.
        result = get_platform_from_fivetran_service(" ")
        assert isinstance(result, str)

    @pytest.mark.parametrize(
        "service_name,expected_platform",
        [
            # Service names with a platform identifier embedded in a longer name
            ("my_company_postgres_prod", "postgres"),
            ("client_mysql_staging", "mysql"),
            ("data_warehouse_snowflake", "snowflake"),
            ("analytics_bigquery_raw", "bigquery"),
            ("logs_kafka_stream", "kafka"),
        ],
    )
    def test_substring_matching_priority(
        self, service_name: str, expected_platform: str
    ) -> None:
        """Check that a platform identifier embedded in a longer name is detected."""
        result = get_platform_from_fivetran_service(service_name)
        assert result == expected_platform, (
            f"Substring matching failed for {service_name}"
        )
|
|
|
|
|
|
class TestConnectorLimits:
    """Sanity checks on the per-connector limit constants."""

    def test_connector_limits_are_reasonable(self) -> None:
        """Check that the limits are positive, ordered, and bounded."""
        table_limit = DEFAULT_MAX_TABLE_LINEAGE_PER_CONNECTOR
        column_limit = MAX_COLUMN_LINEAGE_PER_CONNECTOR
        job_limit = MAX_JOBS_PER_CONNECTOR

        # Every limit must be strictly positive.
        assert table_limit > 0
        assert column_limit > 0
        assert job_limit > 0

        # The column-lineage cap must exceed the table-lineage cap.
        assert column_limit > table_limit

        # The job cap must cover at least 100 jobs without being unbounded.
        assert 100 <= job_limit <= 10000

    def test_limits_prevent_memory_issues(self) -> None:
        """Check that rough worst-case memory estimates stay under 100 MB each."""
        bytes_per_mb = 1024 * 1024
        max_mb_per_category = 100

        # Assumed per-entry footprints in bytes; rough estimates, not measurements.
        table_entry_bytes = 1024
        column_entry_bytes = 256
        job_entry_bytes = 512

        # Worst case for each category: entries at the cap, converted to MB.
        table_memory_mb = (
            DEFAULT_MAX_TABLE_LINEAGE_PER_CONNECTOR * table_entry_bytes / bytes_per_mb
        )
        column_memory_mb = (
            MAX_COLUMN_LINEAGE_PER_CONNECTOR * column_entry_bytes / bytes_per_mb
        )
        job_memory_mb = MAX_JOBS_PER_CONNECTOR * job_entry_bytes / bytes_per_mb

        assert table_memory_mb < max_mb_per_category
        assert column_memory_mb < max_mb_per_category
        assert job_memory_mb < max_mb_per_category
|
|
|
|
|
|
class TestConnectorModes:
|
|
"""Test connector mode enums that affect connector behavior."""
|
|
|
|
def test_fivetran_mode_completeness(self) -> None:
    """Verify that FivetranMode exposes exactly the expected mode values."""
    # Compare as sets so the enum's declaration order is irrelevant.
    actual_modes = {member.value for member in FivetranMode}
    expected_modes = {"auto", "standard", "enterprise"}

    mismatch_message = (
        f"FivetranMode missing expected values. Expected: {expected_modes}, Got: {actual_modes}"
    )
    assert actual_modes == expected_modes, mismatch_message
|
|
|
|
def test_data_job_mode_completeness(self) -> None:
    """Verify that DataJobMode exposes exactly the expected mode values."""
    # Compare as sets so the enum's declaration order is irrelevant.
    actual_modes = {member.value for member in DataJobMode}
    expected_modes = {"consolidated", "per_table"}

    mismatch_message = (
        f"DataJobMode missing expected values. Expected: {expected_modes}, Got: {actual_modes}"
    )
    assert actual_modes == expected_modes, mismatch_message
|
|
|
|
def test_mode_enum_string_conversion(self) -> None:
|
|
"""Test that mode enums can be used as strings (important for config validation)."""
|
|
# FivetranMode should be usable as strings
|
|
for fivetran_mode in FivetranMode:
|
|
assert isinstance(fivetran_mode.value, str)
|
|
assert len(fivetran_mode.value) > 0
|
|
assert fivetran_mode.value.isalpha() # Should be alphabetic characters only
|
|
|
|
# DataJobMode should be usable as strings
|
|
for job_mode in DataJobMode:
|
|
assert isinstance(job_mode.value, str)
|
|
assert len(job_mode.value) > 0
|