datahub/metadata-ingestion/tests/unit/fivetran/test_fivetran_platform_mapping.py
2025-09-13 11:39:03 +01:00

195 lines
7.7 KiB
Python

"""
Unit tests for fivetran_constants.py
Tests focus on business logic and edge cases that matter for connector functionality.
"""
import pytest
from datahub.ingestion.source.fivetran.fivetran_constants import (
DEFAULT_MAX_TABLE_LINEAGE_PER_CONNECTOR,
MAX_COLUMN_LINEAGE_PER_CONNECTOR,
MAX_JOBS_PER_CONNECTOR,
DataJobMode,
FivetranMode,
get_platform_from_fivetran_service,
)
class TestPlatformMapping:
    """Tests for ``get_platform_from_fivetran_service``.

    Every table of (input, expected) cases is expressed with
    ``pytest.mark.parametrize`` so that each service name is collected and
    reported as its own test case, and a failure in one case does not stop
    the remaining cases from running.
    """

    @pytest.mark.parametrize(
        "fivetran_service,expected_datahub_platform",
        [
            # Critical pattern matching for SQL Server variants
            ("sql_server", "mssql"),
            ("sql_server_rds", "mssql"),
            ("azure_sql", "mssql"),
            ("azure_sql_database", "mssql"),
            # PostgreSQL variants that must map correctly
            ("postgres", "postgres"),
            ("postgresql", "postgres"),
            ("postgres_rds", "postgres"),
            ("amazon_rds_for_postgresql", "postgres"),
            # MySQL variants
            ("mysql", "mysql"),
            ("mysql_rds", "mysql"),
            ("google_cloud_mysql", "mysql"),
            ("amazon_rds_for_mysql", "mysql"),
            # Cloud storage platforms
            ("s3", "s3"),
            ("amazon_s3", "s3"),
            ("google_cloud_storage", "gcs"),
            ("gcs", "gcs"),
        ],
    )
    def test_critical_platform_mappings(
        self, fivetran_service: str, expected_datahub_platform: str
    ) -> None:
        """Check that a known Fivetran service maps to the expected platform."""
        result = get_platform_from_fivetran_service(fivetran_service)
        assert result == expected_datahub_platform, (
            f"Platform mapping failed for {fivetran_service}: "
            f"expected {expected_datahub_platform}, got {result}"
        )

    @pytest.mark.parametrize(
        "service_name,expected_platform",
        [
            ("POSTGRES", "postgres"),
            ("PostgreSQL", "postgres"),
            ("MYSQL", "mysql"),
            ("MySQL", "mysql"),
            ("SQL_SERVER", "mssql"),
            ("Sql_Server", "mssql"),
        ],
    )
    def test_case_insensitive_platform_detection(
        self, service_name: str, expected_platform: str
    ) -> None:
        """Check that upper- and mixed-case service names are still mapped."""
        result = get_platform_from_fivetran_service(service_name)
        assert result == expected_platform, (
            f"Case-insensitive detection failed for {service_name}"
        )

    @pytest.mark.parametrize(
        "service",
        [
            "custom_proprietary_db",
            "new_saas_platform_2024",
            "unknown_connector_type",
            "test_service_123",
        ],
    )
    def test_unknown_service_fallback_behavior(self, service: str) -> None:
        """Check that an unrecognised service name is returned unchanged."""
        result = get_platform_from_fivetran_service(service)
        # Should return the service name itself for unknown services
        assert result == service, (
            f"Unknown service {service} should return itself, got {result}"
        )

    def test_empty_and_none_service_handling(self) -> None:
        """Check the empty-string and whitespace-only edge cases.

        Despite the test name, only ``str`` inputs are exercised here;
        ``None`` is never passed to the function.
        """
        # Empty string should return "unknown"
        assert get_platform_from_fivetran_service("") == "unknown"

        # No specific platform is pinned for whitespace-only input: this test
        # only requires that the call completes and yields a string.
        result = get_platform_from_fivetran_service(" ")
        assert isinstance(result, str)

    @pytest.mark.parametrize(
        "service_name,expected_platform",
        [
            # Service names with a platform identifier embedded in them
            ("my_company_postgres_prod", "postgres"),
            ("client_mysql_staging", "mysql"),
            ("data_warehouse_snowflake", "snowflake"),
            ("analytics_bigquery_raw", "bigquery"),
            ("logs_kafka_stream", "kafka"),
        ],
    )
    def test_substring_matching_priority(
        self, service_name: str, expected_platform: str
    ) -> None:
        """Check that a platform name embedded in a longer name is detected."""
        result = get_platform_from_fivetran_service(service_name)
        assert result == expected_platform, (
            f"Substring matching failed for {service_name}"
        )
class TestConnectorLimits:
    """Sanity checks on the per-connector limit constants."""

    def test_connector_limits_are_reasonable(self) -> None:
        """Check the sign, relative ordering and bounds of the limits."""
        table_limit = DEFAULT_MAX_TABLE_LINEAGE_PER_CONNECTOR
        column_limit = MAX_COLUMN_LINEAGE_PER_CONNECTOR
        job_limit = MAX_JOBS_PER_CONNECTOR

        # Every limit has to be strictly positive.
        for limit in (table_limit, column_limit, job_limit):
            assert limit > 0

        # The column lineage cap must exceed the table lineage cap.
        assert table_limit < column_limit

        # The job cap must be at least 100 and at most 10000.
        assert 100 <= job_limit <= 10000

    def test_limits_prevent_memory_issues(self) -> None:
        """Check that rough memory estimates stay below 100 MB per limit."""
        bytes_per_mb = 1024 * 1024
        memory_cap_mb = 100

        # Pairs of (limit, assumed bytes per entry). The per-entry sizes are
        # rough estimates hard-coded by this test.
        sized_limits = (
            (DEFAULT_MAX_TABLE_LINEAGE_PER_CONNECTOR, 1024),  # per table
            (MAX_COLUMN_LINEAGE_PER_CONNECTOR, 256),  # per column
            (MAX_JOBS_PER_CONNECTOR, 512),  # per job
        )

        for limit, bytes_per_entry in sized_limits:
            estimated_mb = (limit * bytes_per_entry) / bytes_per_mb
            assert estimated_mb < memory_cap_mb
class TestConnectorModes:
    """Checks on the values exposed by the connector mode enums."""

    def test_fivetran_mode_completeness(self) -> None:
        """Check that FivetranMode has exactly the expected set of values."""
        actual_modes = {mode.value for mode in FivetranMode}
        expected_modes = {"auto", "standard", "enterprise"}

        assert actual_modes == expected_modes, (
            f"FivetranMode missing expected values. Expected: {expected_modes}, Got: {actual_modes}"
        )

    def test_data_job_mode_completeness(self) -> None:
        """Check that DataJobMode has exactly the expected set of values."""
        actual_modes = {mode.value for mode in DataJobMode}
        expected_modes = {"consolidated", "per_table"}

        assert actual_modes == expected_modes, (
            f"DataJobMode missing expected values. Expected: {expected_modes}, Got: {actual_modes}"
        )

    def test_mode_enum_string_conversion(self) -> None:
        """Check that every mode value is a non-empty string."""
        # FivetranMode values must additionally be purely alphabetic.
        for fivetran_mode in FivetranMode:
            value = fivetran_mode.value
            assert isinstance(value, str)
            assert value
            assert value.isalpha()

        # DataJobMode values are only required to be non-empty strings; the
        # expected value "per_table" contains an underscore.
        for job_mode in DataJobMode:
            value = job_mode.value
            assert isinstance(value, str)
            assert value