# datahub/metadata-ingestion/tests/unit/test_iceberg.py

import sys

import pytest

# Skip the whole module on interpreters older than 3.7; per the skip message,
# iceberg is not available there.
# NOTE(review): this guard presumably has to stay ahead of the `iceberg` imports
# below so that they are never attempted on such interpreters - confirm.
if sys.version_info < (3, 7):
    pytest.skip("iceberg not available for python < 3.7", allow_module_level=True)
from typing import Any, Optional

from iceberg.api import types as IcebergTypes
from iceberg.api.types.types import NestedField

from datahub.configuration.common import ConfigurationError
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.source.azure.azure_common import AdlsSourceConfig
from datahub.ingestion.source.iceberg.iceberg import IcebergSource, IcebergSourceConfig
from datahub.metadata.com.linkedin.pegasus2avro.schema import ArrayType, SchemaField
from datahub.metadata.schema_classes import (
    ArrayTypeClass,
    BooleanTypeClass,
    BytesTypeClass,
    DateTypeClass,
    FixedTypeClass,
    NumberTypeClass,
    RecordTypeClass,
    StringTypeClass,
    TimeTypeClass,
)


def with_iceberg_source() -> IcebergSource:
    """
    Build the IcebergSource instance used by the unit tests of this module.

    The source is configured with a dummy ADLS filesystem (account name, account key
    and container name all set to "test") and a fixed pipeline run id.
    """
    adls_config = AdlsSourceConfig(
        account_name="test", account_key="test", container_name="test"
    )
    pipeline_context = PipelineContext(run_id="iceberg-source-test")
    source_config = IcebergSourceConfig(adls=adls_config)
    return IcebergSource(ctx=pipeline_context, config=source_config)


def assert_field(
    schema_field: SchemaField,
    expected_description: Optional[str],
    expected_nullable: bool,
    expected_type: Any,
) -> None:
    """
    Assert that a schema field has the expected description, nullability and type.

    The checks run in that order and the first mismatch raises an AssertionError
    whose message contains both the actual and the expected value.

    :param schema_field: field to verify; its `description`, `nullable` and `type.type` attributes are read.
    :param expected_description: description the field must be equal to.
    :param expected_nullable: nullable flag the field must be equal to.
    :param expected_type: class (or tuple of classes) that `schema_field.type.type` must be an instance of.
    """
    # Each attribute is only read once the previous check has passed, so the first
    # failing check is always the one that gets reported.
    actual_description = schema_field.description
    assert actual_description == expected_description, (
        f"Field description '{schema_field.description}' is different from expected description '{expected_description}'"
    )

    actual_nullable = schema_field.nullable
    assert actual_nullable == expected_nullable, (
        f"Field nullable '{schema_field.nullable}' is different from expected nullable '{expected_nullable}'"
    )

    actual_type = schema_field.type.type
    assert isinstance(actual_type, expected_type), (
        f"Field type {schema_field.type.type} is different from expected type {expected_type}"
    )


def test_adls_config_no_credential():
    """
    An ADLS config created without any credential information must be rejected with a ConfigurationError.
    """
    credential_free_settings = {"account_name": "test", "container_name": "test"}
    with pytest.raises(ConfigurationError):
        AdlsSourceConfig(**credential_free_settings)


def test_adls_config_with_sas_credential():
    """
    An ADLS config that authenticates with a SAS token must be accepted (no exception raised).
    """
    sas_settings = {
        "account_name": "test",
        "sas_token": "test",
        "container_name": "test",
    }
    AdlsSourceConfig(**sas_settings)


def test_adls_config_with_key_credential():
    """
    An ADLS config that authenticates with an account key must be accepted (no exception raised).
    """
    key_settings = {
        "account_name": "test",
        "account_key": "test",
        "container_name": "test",
    }
    AdlsSourceConfig(**key_settings)


def test_adls_config_with_client_secret_credential():
    """
    Check client secret authentication for an ADLS config.

    The complete (tenant_id, client_id, client_secret) triple must be accepted, while
    leaving out any single one of the three must raise a ConfigurationError.
    """
    complete_settings = {
        "account_name": "test",
        "tenant_id": "test",
        "client_id": "test",
        "client_secret": "test",
        "container_name": "test",
    }
    AdlsSourceConfig(**complete_settings)

    # Drop one element of the credential triple at a time, in the order
    # tenant_id, client_id, client_secret.
    for omitted_key in ("tenant_id", "client_id", "client_secret"):
        partial_settings = {
            key: value
            for key, value in complete_settings.items()
            if key != omitted_key
        }
        with pytest.raises(ConfigurationError):
            AdlsSourceConfig(**partial_settings)


def test_config_for_tests():
    """
    Check that the iceberg source used by the other unit tests can be built without raising.
    """
    _ = with_iceberg_source()


def test_config_no_filesystem():
    """
    Test when no filesystem is configured.

    The source config is created without any argument (neither `adls` nor `localfs`),
    so building the source is expected to raise a ConfigurationError.
    """
    with pytest.raises(ConfigurationError):
        IcebergSource(
            ctx=PipelineContext(run_id="iceberg-source-test"),
            config=IcebergSourceConfig(),
        )


def test_config_multiple_filesystems():
    """
    Test when more than 1 filesystem is configured.

    Both an ADLS filesystem and a local filesystem are passed to the source config,
    which is expected to raise a ConfigurationError.
    """
    # Build a *valid* ADLS config, and do it outside of the `raises` block: an
    # AdlsSourceConfig without credential raises ConfigurationError on its own
    # (see test_adls_config_no_credential), which would make this test pass
    # without ever reaching the multiple-filesystems check.
    adls: AdlsSourceConfig = AdlsSourceConfig(
        account_name="test", account_key="test", container_name="test"
    )
    with pytest.raises(ConfigurationError):
        IcebergSource(
            ctx=PipelineContext(run_id="iceberg-source-test"),
            config=IcebergSourceConfig(adls=adls, localfs="/tmp"),
        )


@pytest.mark.parametrize(
    "iceberg_type, expected_schema_field_type",
    [
        (IcebergTypes.BinaryType.get(), BytesTypeClass),
        (IcebergTypes.BooleanType.get(), BooleanTypeClass),
        (IcebergTypes.DateType.get(), DateTypeClass),
        (IcebergTypes.DecimalType.of(3, 2), NumberTypeClass),
        (IcebergTypes.DoubleType.get(), NumberTypeClass),
        (IcebergTypes.FixedType.of_length(4), FixedTypeClass),
        (IcebergTypes.FloatType.get(), NumberTypeClass),
        (IcebergTypes.IntegerType.get(), NumberTypeClass),
        (IcebergTypes.LongType.get(), NumberTypeClass),
        (IcebergTypes.StringType.get(), StringTypeClass),
        (IcebergTypes.TimestampType.with_timezone(), TimeTypeClass),
        (IcebergTypes.TimestampType.without_timezone(), TimeTypeClass),
        (IcebergTypes.TimeType.get(), TimeTypeClass),
        (IcebergTypes.UUIDType.get(), StringTypeClass),
    ],
)
def test_iceberg_primitive_type_to_schema_field(
    iceberg_type: IcebergTypes.PrimitiveType, expected_schema_field_type: Any
) -> None:
    """
    Check the conversion of a primitive typed Iceberg column to a SchemaField.

    The conversion is exercised twice, once with a required column and once with an
    optional column; each must produce exactly one SchemaField carrying the column
    documentation, its optional flag and the expected DataHub type.
    """
    source = with_iceberg_source()
    required_column = NestedField.required(
        1, "required_field", iceberg_type, "required field documentation"
    )
    optional_column = NestedField.optional(
        1, "optional_field", iceberg_type, "optional field documentation"
    )
    for column in (required_column, optional_column):
        schema_fields = source._get_schema_fields_for_column(column)
        field_count = len(schema_fields)
        assert field_count == 1, f"Expected 1 field, but got {len(schema_fields)}"
        assert_field(
            schema_fields[0],
            column.doc,
            column.is_optional,
            expected_schema_field_type,
        )


@pytest.mark.parametrize(
    "iceberg_type, expected_array_nested_type",
    [
        (IcebergTypes.BinaryType.get(), "bytes"),
        (IcebergTypes.BooleanType.get(), "boolean"),
        (IcebergTypes.DateType.get(), "date"),
        (IcebergTypes.DecimalType.of(3, 2), "decimal"),
        (IcebergTypes.DoubleType.get(), "double"),
        (IcebergTypes.FixedType.of_length(4), "fixed"),
        (IcebergTypes.FloatType.get(), "float"),
        (IcebergTypes.IntegerType.get(), "int"),
        (IcebergTypes.LongType.get(), "long"),
        (IcebergTypes.StringType.get(), "string"),
        (IcebergTypes.TimestampType.with_timezone(), "timestamp-micros"),
        (IcebergTypes.TimestampType.without_timezone(), "timestamp-micros"),
        (IcebergTypes.TimeType.get(), "time-micros"),
        (IcebergTypes.UUIDType.get(), "uuid"),
    ],
)
def test_iceberg_list_to_schema_field(
    iceberg_type: IcebergTypes.PrimitiveType, expected_array_nested_type: Any
) -> None:
    """
    Check the conversion of a list typed Iceberg column to an ArrayType SchemaField.

    A required list column must produce exactly one SchemaField of array type whose
    nested type is a one-element list holding the expected element type name.
    """
    element_list_type = IcebergTypes.ListType.of_required(2, iceberg_type)
    column: NestedField = NestedField.required(
        1, "listField", element_list_type, "documentation"
    )
    source = with_iceberg_source()
    schema_fields = source._get_schema_fields_for_column(column)
    assert len(schema_fields) == 1, (
        f"Expected 1 field, but got {len(schema_fields)}"
    )

    array_field = schema_fields[0]
    assert_field(array_field, column.doc, column.is_optional, ArrayTypeClass)
    assert isinstance(array_field.type.type, ArrayType), (
        f"Field type {schema_fields[0].type.type} was expected to be {ArrayType}"
    )

    arrayType: ArrayType = array_field.type.type
    expected_nested_types = [expected_array_nested_type]
    assert arrayType.nestedType == expected_nested_types, (
        f"List Field nested type {arrayType.nestedType} was expected to be {expected_array_nested_type}"
    )


@pytest.mark.parametrize(
    "iceberg_type, expected_map_type",
    [
        (IcebergTypes.BinaryType.get(), BytesTypeClass),
        (IcebergTypes.BooleanType.get(), BooleanTypeClass),
        (IcebergTypes.DateType.get(), DateTypeClass),
        (IcebergTypes.DecimalType.of(3, 2), NumberTypeClass),
        (IcebergTypes.DoubleType.get(), NumberTypeClass),
        (IcebergTypes.FixedType.of_length(4), FixedTypeClass),
        (IcebergTypes.FloatType.get(), NumberTypeClass),
        (IcebergTypes.IntegerType.get(), NumberTypeClass),
        (IcebergTypes.LongType.get(), NumberTypeClass),
        (IcebergTypes.StringType.get(), StringTypeClass),
        (IcebergTypes.TimestampType.with_timezone(), TimeTypeClass),
        (IcebergTypes.TimestampType.without_timezone(), TimeTypeClass),
        (IcebergTypes.TimeType.get(), TimeTypeClass),
        (IcebergTypes.UUIDType.get(), StringTypeClass),
    ],
)
def test_iceberg_map_to_schema_field(
    iceberg_type: IcebergTypes.PrimitiveType, expected_map_type: Any
) -> None:
    """
    Check the conversion of a map typed Iceberg column whose key and value share the same type.

    Three SchemaFields are expected, in this order: the array standing for the map,
    the (non-nullable) key and the (nullable) value.
    """
    key_value_type = IcebergTypes.MapType.of_required(
        11, 12, iceberg_type, iceberg_type
    )
    column: NestedField = NestedField.required(
        1, "mapField", key_value_type, "documentation"
    )
    source = with_iceberg_source()
    schema_fields = source._get_schema_fields_for_column(column)

    # An Iceberg Map type is converted to an array of struct(key, value) records,
    # hence three fields: the array itself, then the key, then the value.
    assert len(schema_fields) == 3, (
        f"Expected 3 fields, but got {len(schema_fields)}"
    )

    # Position 0: the array field, documented like the map column.
    assert_field(schema_fields[0], column.doc, column.is_optional, ArrayTypeClass)

    # Positions 1 and 2: key (not nullable) and value (nullable), both undocumented.
    for position, nullable in ((1, False), (2, True)):
        assert_field(schema_fields[position], None, nullable, expected_map_type)


@pytest.mark.parametrize(
    "iceberg_type, expected_schema_field_type",
    [
        (IcebergTypes.BinaryType.get(), BytesTypeClass),
        (IcebergTypes.BooleanType.get(), BooleanTypeClass),
        (IcebergTypes.DateType.get(), DateTypeClass),
        (IcebergTypes.DecimalType.of(3, 2), NumberTypeClass),
        (IcebergTypes.DoubleType.get(), NumberTypeClass),
        (IcebergTypes.FixedType.of_length(4), FixedTypeClass),
        (IcebergTypes.FloatType.get(), NumberTypeClass),
        (IcebergTypes.IntegerType.get(), NumberTypeClass),
        (IcebergTypes.LongType.get(), NumberTypeClass),
        (IcebergTypes.StringType.get(), StringTypeClass),
        (IcebergTypes.TimestampType.with_timezone(), TimeTypeClass),
        (IcebergTypes.TimestampType.without_timezone(), TimeTypeClass),
        (IcebergTypes.TimeType.get(), TimeTypeClass),
        (IcebergTypes.UUIDType.get(), StringTypeClass),
    ],
)
def test_iceberg_struct_to_schema_field(
    iceberg_type: IcebergTypes.PrimitiveType, expected_schema_field_type: Any
) -> None:
    """
    Check the conversion of a struct typed Iceberg column to a RecordType SchemaField.

    The struct holds a single required member, so two SchemaFields are expected:
    the record itself followed by its member.
    """
    member: NestedField = NestedField.required(
        11, "field1", iceberg_type, "field documentation"
    )
    struct_type = IcebergTypes.StructType.of([member])
    column: NestedField = NestedField.required(
        1, "structField", struct_type, "struct documentation"
    )
    source = with_iceberg_source()
    schema_fields = source._get_schema_fields_for_column(column)
    assert len(schema_fields) == 2, (
        f"Expected 2 fields, but got {len(schema_fields)}"
    )

    # First the record standing for the struct, then the struct member.
    assert_field(schema_fields[0], column.doc, column.is_optional, RecordTypeClass)
    assert_field(
        schema_fields[1], member.doc, member.is_optional, expected_schema_field_type
    )


def test_avro_decimal_bytes_nullable():
    """
    Print how avro schema parsing treats the extra `_nullable` attribute for three field types.

    The following test exposes a problem with decimal (bytes) not preserving extra attributes like _nullable.  Decimal (fixed) and Boolean for example do.
    NOTE: This bug was by-passed by mapping the Decimal type to fixed instead of bytes.

    This test makes no assertion: for each case it parses the schema string and prints
    the original string next to the parsed schema, so the comparison has to be done by
    reading the captured output. It can only fail if parsing itself raises.
    """
    import avro.schema

    # Case 1: decimal logical type backed by `bytes`, with `_nullable` set on the field type.
    decimal_avro_schema_string = """{"type": "record", "name": "__struct_", "fields": [{"type": {"type": "bytes", "precision": 3, "scale": 2, "logicalType": "decimal", "native_data_type": "decimal(3, 2)", "_nullable": false}, "name": "required_field", "doc": "required field documentation"}]}"""
    decimal_avro_schema = avro.schema.parse(decimal_avro_schema_string)
    print("\nDecimal (bytes)")
    print(
        f"Original avro schema string:                         {decimal_avro_schema_string}"
    )
    print(f"After avro parsing, _nullable attribute is missing:  {decimal_avro_schema}")

    # Case 2: the same decimal, but backed by `fixed` (which also requires a name and a size).
    decimal_fixed_avro_schema_string = """{"type": "record", "name": "__struct_", "fields": [{"type": {"type": "fixed", "logicalType": "decimal", "precision": 3, "scale": 2, "native_data_type": "decimal(3, 2)", "_nullable": false, "name": "bogusName", "size": 16}, "name": "required_field", "doc": "required field documentation"}]}"""
    decimal_fixed_avro_schema = avro.schema.parse(decimal_fixed_avro_schema_string)
    print("\nDecimal (fixed)")
    print(
        f"Original avro schema string:                           {decimal_fixed_avro_schema_string}"
    )
    print(
        f"After avro parsing, _nullable attribute is preserved:  {decimal_fixed_avro_schema}"
    )

    # Case 3: a plain boolean carrying the same extra `_nullable` attribute.
    boolean_avro_schema_string = """{"type": "record", "name": "__struct_", "fields": [{"type": {"type": "boolean", "native_data_type": "boolean", "_nullable": false}, "name": "required_field", "doc": "required field documentation"}]}"""
    boolean_avro_schema = avro.schema.parse(boolean_avro_schema_string)
    print("\nBoolean")
    print(
        f"Original avro schema string:                           {boolean_avro_schema_string}"
    )
    print(
        f"After avro parsing, _nullable attribute is preserved:  {boolean_avro_schema}"
    )