# datahub/metadata-ingestion/tests/unit/fivetran/test_fivetran_platform_mapping.py

"""
Unit tests for fivetran_constants.py

Tests focus on business logic and edge cases that matter for connector functionality.
"""

import pytest

from datahub.ingestion.source.fivetran.fivetran_constants import (
    DEFAULT_MAX_TABLE_LINEAGE_PER_CONNECTOR,
    MAX_COLUMN_LINEAGE_PER_CONNECTOR,
    MAX_JOBS_PER_CONNECTOR,
    DataJobMode,
    FivetranMode,
    get_platform_from_fivetran_service,
)


class TestPlatformMapping:
    """Verify how Fivetran service names are mapped to DataHub platform names.

    Every test calls ``get_platform_from_fivetran_service`` and checks the
    returned platform string. Table-driven tests are parametrized so that each
    input is reported as its own case and one failing mapping does not hide
    the outcome of the remaining ones.
    """

    @pytest.mark.parametrize(
        "fivetran_service,expected_datahub_platform",
        [
            # Critical pattern matching for SQL Server variants
            ("sql_server", "mssql"),
            ("sql_server_rds", "mssql"),
            ("azure_sql", "mssql"),
            ("azure_sql_database", "mssql"),
            # PostgreSQL variants that must map correctly
            ("postgres", "postgres"),
            ("postgresql", "postgres"),
            ("postgres_rds", "postgres"),
            ("amazon_rds_for_postgresql", "postgres"),
            # MySQL variants
            ("mysql", "mysql"),
            ("mysql_rds", "mysql"),
            ("google_cloud_mysql", "mysql"),
            ("amazon_rds_for_mysql", "mysql"),
            # Cloud storage platforms
            ("s3", "s3"),
            ("amazon_s3", "s3"),
            ("google_cloud_storage", "gcs"),
            ("gcs", "gcs"),
        ],
    )
    def test_critical_platform_mappings(
        self, fivetran_service: str, expected_datahub_platform: str
    ) -> None:
        """Check that a known Fivetran service name maps to the expected platform."""
        result = get_platform_from_fivetran_service(fivetran_service)
        assert result == expected_datahub_platform, (
            f"Platform mapping failed for {fivetran_service}: "
            f"expected {expected_datahub_platform}, got {result}"
        )

    @pytest.mark.parametrize(
        "service_name,expected_platform",
        [
            ("POSTGRES", "postgres"),
            ("PostgreSQL", "postgres"),
            ("MYSQL", "mysql"),
            ("MySQL", "mysql"),
            ("SQL_SERVER", "mssql"),
            ("Sql_Server", "mssql"),
        ],
    )
    def test_case_insensitive_platform_detection(
        self, service_name: str, expected_platform: str
    ) -> None:
        """Check that upper- and mixed-case service names map like lower-case ones."""
        result = get_platform_from_fivetran_service(service_name)
        assert result == expected_platform, (
            f"Case-insensitive detection failed for {service_name}"
        )

    @pytest.mark.parametrize(
        "service",
        [
            "custom_proprietary_db",
            "new_saas_platform_2024",
            "unknown_connector_type",
            "test_service_123",
        ],
    )
    def test_unknown_service_fallback_behavior(self, service: str) -> None:
        """Check that an unrecognized service name is returned unchanged."""
        result = get_platform_from_fivetran_service(service)
        # Should return the service name itself for unknown services
        assert result == service, (
            f"Unknown service {service} should return itself, got {result}"
        )

    def test_empty_and_none_service_handling(self) -> None:
        """Check the empty-string and whitespace-only edge cases.

        Despite the test name, ``None`` is not passed to the function here;
        only string inputs are exercised.
        """
        # Empty string should return "unknown"
        assert get_platform_from_fivetran_service("") == "unknown"

        # The exact result for whitespace-only input is deliberately not
        # pinned; this only verifies that the call does not raise and that
        # the return value is a string.
        result = get_platform_from_fivetran_service("   ")
        assert isinstance(result, str)

    @pytest.mark.parametrize(
        "service_name,expected_platform",
        [
            # Service names with a platform identifier embedded in a longer name
            ("my_company_postgres_prod", "postgres"),
            ("client_mysql_staging", "mysql"),
            ("data_warehouse_snowflake", "snowflake"),
            ("analytics_bigquery_raw", "bigquery"),
            ("logs_kafka_stream", "kafka"),
        ],
    )
    def test_substring_matching_priority(
        self, service_name: str, expected_platform: str
    ) -> None:
        """Check that a platform identifier embedded in a longer name is detected."""
        result = get_platform_from_fivetran_service(service_name)
        assert result == expected_platform, (
            f"Substring matching failed for {service_name}"
        )


class TestConnectorLimits:
    """Sanity checks on the per-connector limit constants."""

    def test_connector_limits_are_reasonable(self) -> None:
        """Check that every limit is positive and that the limits relate sensibly."""
        # Each limit has to be strictly positive.
        for limit in (
            DEFAULT_MAX_TABLE_LINEAGE_PER_CONNECTOR,
            MAX_COLUMN_LINEAGE_PER_CONNECTOR,
            MAX_JOBS_PER_CONNECTOR,
        ):
            assert limit > 0

        # The default table lineage cap must stay below the column lineage cap.
        assert (
            DEFAULT_MAX_TABLE_LINEAGE_PER_CONNECTOR < MAX_COLUMN_LINEAGE_PER_CONNECTOR
        )

        # The job cap must allow at least 100 jobs but no more than 10000.
        assert 100 <= MAX_JOBS_PER_CONNECTOR <= 10000

    def test_limits_prevent_memory_issues(self) -> None:
        """Check that a rough per-limit memory estimate stays under 100 MB.

        The per-entry byte sizes below are rough estimates used only by this
        test; they are not measured values.
        """
        bytes_per_mb = 1024 * 1024
        ceiling_mb = 100

        # Pairs of (limit, estimated bytes per entry), in table / column / job order.
        sized_limits = (
            (DEFAULT_MAX_TABLE_LINEAGE_PER_CONNECTOR, 1024),
            (MAX_COLUMN_LINEAGE_PER_CONNECTOR, 256),
            (MAX_JOBS_PER_CONNECTOR, 512),
        )

        for limit, bytes_per_entry in sized_limits:
            footprint_mb = (limit * bytes_per_entry) / bytes_per_mb
            assert footprint_mb < ceiling_mb


class TestConnectorModes:
    """Checks on the values exposed by the FivetranMode and DataJobMode enums."""

    def test_fivetran_mode_completeness(self) -> None:
        """FivetranMode values must be exactly auto, standard and enterprise."""
        wanted = {"auto", "standard", "enterprise"}
        found = {member.value for member in FivetranMode}
        message = (
            f"FivetranMode missing expected values. Expected: {wanted}, Got: {found}"
        )

        assert found == wanted, message

    def test_data_job_mode_completeness(self) -> None:
        """DataJobMode values must be exactly consolidated and per_table."""
        wanted = {"consolidated", "per_table"}
        found = {member.value for member in DataJobMode}
        message = (
            f"DataJobMode missing expected values. Expected: {wanted}, Got: {found}"
        )

        assert found == wanted, message

    def test_mode_enum_string_conversion(self) -> None:
        """Every mode value must be a non-empty string.

        FivetranMode values are additionally required to be purely alphabetic;
        that check is not applied to DataJobMode values.
        """
        for fivetran_mode in FivetranMode:
            raw_value = fivetran_mode.value
            assert isinstance(raw_value, str)
            assert raw_value != ""
            # Letters only: no digits, underscores or other separators.
            assert raw_value.isalpha()

        for job_mode in DataJobMode:
            raw_value = job_mode.value
            assert isinstance(raw_value, str)
            assert raw_value != ""