import json
import logging
import os
from datetime import datetime
from unittest.mock import Mock, call, patch
import moto.s3
import pytest
from boto3.session import Session
from moto import mock_s3
from pydantic import ValidationError
from datahub.ingestion.run.pipeline import Pipeline, PipelineContext
from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig
from datahub.ingestion.source.aws.s3_boto_utils import (
list_folders_path,
list_objects_recursive_path,
)
from datahub.ingestion.source.s3.source import S3Source
from datahub.testing import mce_helpers
logging.getLogger("boto3").setLevel(logging.INFO)
logging.getLogger("botocore").setLevel(logging.INFO)
logging.getLogger("s3transfer").setLevel(logging.INFO)
FROZEN_TIME = "2020-04-14 07:00:00"
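# Expected relative paths of every file under tests/integration/s3/test_data/local_system/.
# The s3_populate fixture uploads these to each mocked bucket and asserts that the files
# found on disk match this list exactly.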
FILE_LIST_FOR_VALIDATION = [
"folder_a/folder_aa/folder_aaa/NPS.7.1.package_data_NPS.6.1_ARCN_Lakes_ChemistryData_v1_csv.csv",
"folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro",
"folder_a/folder_aa/folder_aaa/chord_progressions_csv.csv",
"folder_a/folder_aa/folder_aaa/countries_json.json",
"folder_a/folder_aa/folder_aaa/food_parquet.parquet",
"folder_a/folder_aa/folder_aaa/small.csv",
"folder_a/folder_aa/folder_aaa/wa_fn_usec_hr_employee_attrition_csv.csv",
"folder_a/folder_aa/folder_aaa/folder_aaaa/pokemon_abilities_yearwise_2019/month=feb/part1.json",
"folder_a/folder_aa/folder_aaa/folder_aaaa/pokemon_abilities_yearwise_2019/month=feb/part2.json",
"folder_a/folder_aa/folder_aaa/folder_aaaa/pokemon_abilities_yearwise_2019/month=jan/part1.json",
"folder_a/folder_aa/folder_aaa/folder_aaaa/pokemon_abilities_yearwise_2019/month=jan/part2.json",
"folder_a/folder_aa/folder_aaa/folder_aaaa/pokemon_abilities_yearwise_2020/month=feb/part1.json",
"folder_a/folder_aa/folder_aaa/folder_aaaa/pokemon_abilities_yearwise_2020/month=feb/part2.json",
"folder_a/folder_aa/folder_aaa/folder_aaaa/pokemon_abilities_yearwise_2020/month=march/part1.json",
"folder_a/folder_aa/folder_aaa/folder_aaaa/pokemon_abilities_yearwise_2020/month=march/part2.json",
"folder_a/folder_aa/folder_aaa/folder_aaaa/pokemon_abilities_yearwise_2021/month=april/part1.json",
"folder_a/folder_aa/folder_aaa/folder_aaaa/pokemon_abilities_yearwise_2021/month=april/part2.json",
"folder_a/folder_aa/folder_aaa/folder_aaaa/pokemon_abilities_yearwise_2021/month=march/part1.json",
"folder_a/folder_aa/folder_aaa/folder_aaaa/pokemon_abilities_yearwise_2021/month=march/part2.json",
"folder_a/folder_aa/folder_aaa/food_csv/part1.csv",
"folder_a/folder_aa/folder_aaa/food_csv/part2.csv",
"folder_a/folder_aa/folder_aaa/food_csv/part3.csv",
"folder_a/folder_aa/folder_aaa/food_parquet/part1.parquet",
"folder_a/folder_aa/folder_aaa/food_parquet/part2.parquet",
"folder_a/folder_aa/folder_aaa/no_extension/small",
"folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2019/month=feb/part1.json",
"folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2019/month=feb/part2.json",
"folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2019/month=jan/part1.json",
"folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2019/month=jan/part2.json",
"folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2020/month=feb/part1.json",
"folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2020/month=feb/part2.json",
"folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2020/month=march/part1.json",
"folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2020/month=march/part2.json",
"folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2021/month=april/part1.json",
"folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2021/month=april/part2.json",
"folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2021/month=march/part1.json",
"folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2021/month=march/part2.json",
"folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/part3.json",
"folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/_temporary/dummy.json",
]
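# Module-scoped, autouse fixtures: every test in this module runs against a moto-mocked
# S3 environment with two pre-populated buckets.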
@pytest.fixture(scope="module", autouse=True)
def bucket_names():
return ["my-test-bucket", "my-test-bucket-2"]
@pytest.fixture(scope="module", autouse=True)
def s3():
with mock_s3():
conn = Session(
aws_access_key_id="test",
aws_secret_access_key="test",
region_name="us-east-1",
)
yield conn
@pytest.fixture(scope="module", autouse=True)
def s3_resource(s3):
with mock_s3():
conn = s3.resource("s3")
yield conn
@pytest.fixture(scope="module", autouse=True)
def s3_client(s3):
with mock_s3():
conn = s3.client("s3")
yield conn
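# Build readable pytest ids of the form "<recipe dir>_<recipe name>" from the
# (source_dir, source_file) tuples used to parametrize the ingest tests.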
def get_descriptive_id(source_tuple):
source_dir, source_file = source_tuple
dir_name = os.path.basename(source_dir)
base_name = source_file.replace(".json", "")
return f"{dir_name}_{base_name}"
@pytest.fixture(scope="module", autouse=True)
def s3_populate(pytestconfig, s3_resource, s3_client, bucket_names):
for bucket_name in bucket_names:
logging.info(f"Populating s3 bucket: {bucket_name}")
s3_resource.create_bucket(Bucket=bucket_name)
bkt = s3_resource.Bucket(bucket_name)
bkt.Tagging().put(Tagging={"TagSet": [{"Key": "foo", "Value": "bar"}]})
test_resources_dir = (
pytestconfig.rootpath / "tests/integration/s3/test_data/local_system/"
)
current_time_sec = datetime.strptime(
FROZEN_TIME, "%Y-%m-%d %H:%M:%S"
).timestamp()
file_list = []
for root, _dirs, files in os.walk(test_resources_dir):
_dirs.sort()
for file in sorted(files):
full_path = os.path.join(root, file)
rel_path = os.path.relpath(full_path, test_resources_dir)
file_list.append(rel_path)
bkt.upload_file(
full_path,
                    rel_path,
                    # Set the content type for the `no_extension/small` file to text/csv
                    ExtraArgs=(
{"ContentType": "text/csv"} if "." not in rel_path else {}
),
)
s3_client.put_object_tagging(
Bucket=bucket_name,
Key=rel_path,
Tagging={"TagSet": [{"Key": "baz", "Value": "bob"}]},
)
key = (
moto.s3.models.s3_backends["123456789012"]["global"]
.buckets[bucket_name]
.keys[rel_path]
)
current_time_sec += 10
key.last_modified = datetime.fromtimestamp(current_time_sec)
        # Sanity check: the files uploaded from disk must match the expected list above.
assert file_list == FILE_LIST_FOR_VALIDATION
yield
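# Give the local copies of the test data deterministic modification times, mirroring the
# last_modified timestamps assigned to the mocked S3 objects.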
@pytest.fixture(scope="module", autouse=True)
def touch_local_files(pytestconfig):
test_resources_dir = (
pytestconfig.rootpath / "tests/integration/s3/test_data/local_system/"
)
current_time_sec = datetime.strptime(FROZEN_TIME, "%Y-%m-%d %H:%M:%S").timestamp()
for root, _dirs, files in os.walk(test_resources_dir):
_dirs.sort()
for file in sorted(files):
current_time_sec += 10
full_path = os.path.join(root, file)
os.utime(full_path, times=(current_time_sec, current_time_sec))
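# Ingestion recipes used to parametrize the tests below. Shared recipes run against S3,
# GCS, and the local file source; recipes under sources/s3 run against S3 and GCS only.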
SHARED_SOURCE_FILES_PATH = "./tests/integration/s3/sources/shared"
shared_source_files = [
(SHARED_SOURCE_FILES_PATH, p) for p in os.listdir(SHARED_SOURCE_FILES_PATH)
]
S3_SOURCE_FILES_PATH = "./tests/integration/s3/sources/s3"
s3_source_files = [(S3_SOURCE_FILES_PATH, p) for p in os.listdir(S3_SOURCE_FILES_PATH)]
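# Run each recipe through a full ingestion pipeline against the mocked buckets and
# compare the emitted metadata against the corresponding golden file.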
@pytest.mark.integration
@pytest.mark.parametrize(
"source_file_tuple", shared_source_files + s3_source_files, ids=get_descriptive_id
)
def test_data_lake_s3_ingest(
pytestconfig, s3_populate, source_file_tuple, tmp_path, mock_time
):
source_dir, source_file = source_file_tuple
test_resources_dir = pytestconfig.rootpath / "tests/integration/s3/"
    with open(os.path.join(source_dir, source_file)) as f:
        source = json.load(f)
config_dict = {}
config_dict["source"] = source
config_dict["sink"] = {
"type": "file",
"config": {
"filename": f"{tmp_path}/{source_file}",
},
}
config_dict["run_id"] = source_file
pipeline = Pipeline.create(config_dict)
pipeline.run()
pipeline.raise_from_status()
# Verify the output.
mce_helpers.check_golden_file(
pytestconfig,
output_path=f"{tmp_path}/{source_file}",
golden_path=f"{test_resources_dir}/golden-files/s3/golden_mces_{source_file}",
ignore_paths=[
r"root\[\d+\]\['aspect'\]\['json'\]\['lastUpdatedTimestamp'\]",
],
)
@pytest.mark.integration
@pytest.mark.parametrize(
"source_file_tuple", shared_source_files + s3_source_files, ids=get_descriptive_id
)
def test_data_lake_gcs_ingest(
pytestconfig, s3_populate, source_file_tuple, tmp_path, mock_time
):
source_dir, source_file = source_file_tuple
test_resources_dir = pytestconfig.rootpath / "tests/integration/s3/"
    with open(os.path.join(source_dir, source_file)) as f:
        source = json.load(f)
config_dict = {}
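    # Rewrite the S3 recipe into an equivalent GCS recipe: swap the source type, map the
    # AWS keys onto HMAC credentials, rewrite s3:// URIs to gs://, and drop options that
    # only apply to the S3 source.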
source["type"] = "gcs"
source["config"]["credential"] = {
"hmac_access_id": source["config"]["aws_config"]["aws_access_key_id"],
"hmac_access_secret": source["config"]["aws_config"]["aws_secret_access_key"],
}
for path_spec in source["config"]["path_specs"]:
path_spec["include"] = path_spec["include"].replace("s3://", "gs://")
source["config"].pop("aws_config")
source["config"].pop("profiling", None)
source["config"].pop("sort_schema_fields", None)
source["config"].pop("use_s3_bucket_tags", None)
source["config"].pop("use_s3_content_type", None)
source["config"].pop("use_s3_object_tags", None)
config_dict["source"] = source
config_dict["sink"] = {
"type": "file",
"config": {
"filename": f"{tmp_path}/{source_file}",
},
}
config_dict["run_id"] = source_file
with patch("datahub.ingestion.source.gcs.gcs_source.GCS_ENDPOINT_URL", None):
pipeline = Pipeline.create(config_dict)
pipeline.run()
pipeline.raise_from_status()
# Verify the output.
mce_helpers.check_golden_file(
pytestconfig,
output_path=f"{tmp_path}/{source_file}",
golden_path=f"{test_resources_dir}/golden-files/gcs/golden_mces_{source_file}",
ignore_paths=[
r"root\[\d+\]\['aspect'\]\['json'\]\['lastUpdatedTimestamp'\]",
],
)
@pytest.mark.integration
@pytest.mark.parametrize(
"source_file_tuple", shared_source_files, ids=get_descriptive_id
)
def test_data_lake_local_ingest(
pytestconfig, touch_local_files, source_file_tuple, tmp_path, mock_time
):
source_dir, source_file = source_file_tuple
test_resources_dir = pytestconfig.rootpath / "tests/integration/s3/"
    with open(os.path.join(source_dir, source_file)) as f:
        source = json.load(f)
config_dict = {}
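    # Point the path specs at the local copy of the test data instead of the mocked
    # buckets, enable profiling, and drop options that only apply to the S3 source.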
for path_spec in source["config"]["path_specs"]:
path_spec["include"] = (
path_spec["include"]
.replace(
"s3://my-test-bucket/", "tests/integration/s3/test_data/local_system/"
)
.replace(
"s3://my-test-bucket-2/", "tests/integration/s3/test_data/local_system/"
)
)
source["config"]["profiling"]["enabled"] = True
source["config"].pop("aws_config")
source["config"].pop("use_s3_bucket_tags", None)
source["config"].pop("use_s3_object_tags", None)
config_dict["source"] = source
config_dict["sink"] = {
"type": "file",
"config": {
"filename": f"{tmp_path}/{source_file}",
},
}
config_dict["run_id"] = source_file
pipeline = Pipeline.create(config_dict)
pipeline.run()
pipeline.raise_from_status()
# Verify the output.
mce_helpers.check_golden_file(
pytestconfig,
output_path=f"{tmp_path}/{source_file}",
golden_path=f"{test_resources_dir}/golden-files/local/golden_mces_{source_file}",
ignore_paths=[
r"root\[\d+\]\['aspect'\]\['json'\]\['lastUpdatedTimestamp'\]",
r"root\[\d+\]\['aspect'\]\['json'\]\[\d+\]\['value'\]\['time'\]",
r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['created'\]\['time'\]",
# root[41]['aspect']['json']['fieldProfiles'][0]['sampleValues'][0]
r"root\[\d+\]\['aspect'\]\['json'\]\['fieldProfiles'\]\[\d+\]\['sampleValues'\]",
# "root[0]['proposedSnapshot']['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot']['aspects'][2]['com.linkedin.pegasus2avro.schema.SchemaMetadata']['fields'][4]"
r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.schema.SchemaMetadata'\]\['fields'\]",
# "root[0]['proposedSnapshot']['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot']['aspects'][1]['com.linkedin.pegasus2avro.dataset.DatasetProperties']['customProperties']['size_in_bytes']"
r"root\[\d+\]\['proposedSnapshot'\]\['com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot'\]\['aspects'\]\[\d+\]\['com.linkedin.pegasus2avro.dataset.DatasetProperties'\]\['customProperties'\]\['size_in_bytes'\]",
],
)
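# Config validation: each malformed path_spec below should be rejected with a ValidationError.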
def test_data_lake_incorrect_config_raises_error(tmp_path, mock_time):
ctx = PipelineContext(run_id="test-s3")
# Baseline: valid config
source: dict = {
"path_spec": {"include": "a/b/c/d/{table}.*", "table_name": "{table}"}
}
s3 = S3Source.create(source, ctx)
assert s3.source_config.platform == "file"
    # Case 1: named variable in table_name is not present in include
source = {"path_spec": {"include": "a/b/c/d/{table}.*", "table_name": "{table1}"}}
with pytest.raises(ValidationError, match="table_name"):
S3Source.create(source, ctx)
    # Case 2: named variable in exclude is not allowed
source = {
"path_spec": {
"include": "a/b/c/d/{table}/*.*",
"exclude": ["a/b/c/d/a-{exclude}/**"],
},
}
with pytest.raises(ValidationError, match=r"exclude.*named variable"):
S3Source.create(source, ctx)
    # Case 3: unsupported file type is not allowed
source = {
"path_spec": {
"include": "a/b/c/d/{table}/*.hd5",
}
}
with pytest.raises(ValidationError, match="file type"):
S3Source.create(source, ctx)
    # Case 4: ** in include is not allowed
source = {
"path_spec": {
"include": "a/b/c/d/**/*.*",
},
}
with pytest.raises(ValidationError, match=r"\*\*"):
S3Source.create(source, ctx)
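# Each tuple below is (test id, path_spec, expected calls): the path_spec is fed to the
# S3 source and the expected calls describe exactly which folder listings and recursive
# object listings it should issue against the s3_boto_utils helpers.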
@pytest.mark.parametrize(
"calls_test_tuple",
[
(
"partitions_and_filename_with_prefix",
{
"include": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/{table}/year={year}/month={month}/part*.json",
"tables_filter_pattern": {"allow": ["^pokemon_abilities_json$"]},
},
[
call.list_folders_path(
"s3://my-test-bucket/folder_a/folder_aa/folder_aaa/"
),
call.list_folders_path(
s3_uri="s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/",
startswith="year=",
),
call.list_folders_path(
s3_uri="s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/",
startswith="month=",
),
call.list_objects_recursive_path(
"s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/",
startswith="part",
),
],
),
(
"filter_specific_partition",
{
"include": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/{table}/year=2022/month={month}/*.json",
"tables_filter_pattern": {"allow": ["^pokemon_abilities_json$"]},
},
[
call.list_folders_path(
"s3://my-test-bucket/folder_a/folder_aa/folder_aaa/"
),
call.list_folders_path(
s3_uri="s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022",
startswith="month=",
),
call.list_objects_recursive_path(
"s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/",
startswith="",
),
],
),
(
"partition_autodetection",
{
"include": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/{table}/",
"tables_filter_pattern": {"allow": ["^pokemon_abilities_json$"]},
},
[
call.list_folders_path(
"s3://my-test-bucket/folder_a/folder_aa/folder_aaa/"
),
call.list_folders_path(
s3_uri="s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/",
startswith="",
),
call.list_folders_path(
s3_uri="s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/",
startswith="",
),
call.list_folders_path(
s3_uri="s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/",
startswith="",
),
call.list_objects_recursive_path(
"s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022/month=jan/",
startswith="",
),
],
),
(
"partitions_traversal_all",
{
"include": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/{table}/year={year}/month={month}/*.json",
"tables_filter_pattern": {"allow": ["^pokemon_abilities_json$"]},
"traversal_method": "ALL",
},
[
call.list_folders_path(
"s3://my-test-bucket/folder_a/folder_aa/folder_aaa/"
),
call.list_objects_recursive_path(
"s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/",
startswith="year=",
),
],
),
(
"filter_specific_partition_traversal_all",
{
"include": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/{table}/year=2022/month={month}/part*.json",
"tables_filter_pattern": {"allow": ["^pokemon_abilities_json$"]},
"traversal_method": "ALL",
},
[
call.list_folders_path(
"s3://my-test-bucket/folder_a/folder_aa/folder_aaa/"
),
call.list_objects_recursive_path(
"s3://my-test-bucket/folder_a/folder_aa/folder_aaa/pokemon_abilities_json/year=2022",
startswith="month=",
),
],
),
],
ids=lambda calls_test_tuple: calls_test_tuple[0],
)
def test_data_lake_s3_calls(s3_populate, calls_test_tuple):
_, path_spec, expected_calls = calls_test_tuple
ctx = PipelineContext(run_id="test-s3")
config = {
"path_specs": [path_spec],
"aws_config": {
"aws_region": "us-east-1",
"aws_access_key_id": "testing",
"aws_secret_access_key": "testing",
},
}
source = S3Source.create(config, ctx)
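    # Wrap the listing helpers in a Mock that records every call (and its order) while
    # delegating to the real implementations, then patch them into the source module.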
m = Mock()
m.list_folders_path.side_effect = list_folders_path
m.list_objects_recursive_path.side_effect = list_objects_recursive_path
with (
patch(
"datahub.ingestion.source.s3.source.list_folders_path", m.list_folders_path
),
patch(
"datahub.ingestion.source.s3.source.list_objects_recursive_path",
m.list_objects_recursive_path,
),
):
for _ in source.get_workunits_internal():
pass
    # Verify the S3 calls. The source should make the minimum necessary calls, using
    # prefixes where possible, to keep the number of S3 API requests down.
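    # Normalize the recorded calls: strip the aws_config argument (whether passed as a
    # keyword or as the third positional argument) so they compare equal to the expected
    # calls above.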
calls = []
for c in m.mock_calls:
if isinstance(c.kwargs, dict): # type assertion
c.kwargs.pop("aws_config", None)
if len(c.args) == 3 and isinstance(c.args[2], AwsConnectionConfig):
c = getattr(call, c[0])(*(c.args[:2]), **c.kwargs)
calls.append(c)
assert calls == expected_calls