datahub/metadata-ingestion/tests/unit/s3/test_s3_source.py

94 lines
3.1 KiB
Python
Raw Normal View History

from datahub.ingestion.source.data_lake_common.path_spec import PathSpec
from datahub.ingestion.source.s3.source import partitioned_folder_comparator
def test_partition_comparator_numeric_folder_name():
    """Comparing the bare numeric folder names "3" and "12" should yield -1."""
    result = partitioned_folder_comparator("3", "12")
    assert result == -1
def test_partition_multi_level_key():
    """Two multi-level paths that differ only in the trailing month value.

    The left path ends in ``month=01`` and the right one in ``month=2``;
    the comparator is expected to return -1 for this pair.
    """
    left = "backup/metadata_aspect_v2/year=2023/month=01"
    right = "backup/metadata_aspect_v2/year=2023/month=2"
    result = partitioned_folder_comparator(left, right)
    assert result == -1
def test_partition_comparator_numeric_folder_name2():
    """Comparing the bare numeric folder names "12" and "3" should yield 1."""
    result = partitioned_folder_comparator("12", "3")
    assert result == 1
def test_partition_comparator_string_folder():
    """Two different non-numeric folder names: the expected result is 1."""
    # NOTE(review): the two names also differ in letter case ("myfolderB" vs
    # "myFolderA"); kept exactly as written -- confirm this is intentional.
    left = "myfolderB"
    right = "myFolderA"
    result = partitioned_folder_comparator(left, right)
    assert result == 1
def test_partition_comparator_string_same_folder():
    """Comparing a folder name with an identical name should yield 0."""
    left = "myFolderA"
    right = "myFolderA"
    result = partitioned_folder_comparator(left, right)
    assert result == 0
def test_partition_comparator_with_numeric_partition():
    """``year=3`` compared against ``year=12`` should yield -1."""
    result = partitioned_folder_comparator("year=3", "year=12")
    assert result == -1
def test_partition_comparator_with_padded_numeric_partition():
    """A zero-padded value (``year=03``) against ``year=12`` should yield -1."""
    result = partitioned_folder_comparator("year=03", "year=12")
    assert result == -1
def test_partition_comparator_with_equal_sign_in_name():
    """Folders whose ``key=value`` names use different keys.

    ``month=12`` compared against ``year=0`` is expected to yield -1.
    """
    left = "month=12"
    right = "year=0"
    result = partitioned_folder_comparator(left, right)
    assert result == -1
def test_partition_comparator_with_string_partition():
    """Partition values that are not purely numeric.

    ``year=year2020`` compared against ``year=year2021`` is expected to
    yield -1.
    """
    left = "year=year2020"
    right = "year=year2021"
    result = partitioned_folder_comparator(left, right)
    assert result == -1
def test_path_spec():
    """A CSV file under year/month/day partitions is allowed by the spec."""
    spec = PathSpec(
        include="s3://my-bucket/my-folder/year=*/month=*/day=*/*.csv",
        default_extension="csv",
    )
    candidate = "s3://my-bucket/my-folder/year=2022/month=10/day=11/my_csv.csv"
    is_allowed = spec.allowed(candidate)
    assert is_allowed
def test_path_spec_dir_allowed():
    """Check ``PathSpec.dir_allowed`` against a series of paths.

    The spec includes CSV files under ``year=*/month=*/day=*`` and excludes
    one specific day folder plus everything under ``month=10``.  Each path
    below is paired with the verdict the test expects; the paths are
    checked in the listed order.
    """
    spec = PathSpec(
        include="s3://my-bucket/my-folder/year=*/month=*/day=*/*.csv",
        exclude=[
            "s3://my-bucket/my-folder/year=2022/month=12/day=11",
            "s3://my-bucket/my-folder/year=2022/month=10/**",
        ],
        default_extension="csv",
    )

    # (path, expected dir_allowed result), evaluated top to bottom.
    expectations = [
        ("s3://my-bucket/my-folder/year=2022/", True),
        ("s3://my-bucket/my-folder/year=2022/month=12/", True),
        ("s3://my-bucket/my-folder/year=2022/month=12/day=11/my_csv.csv", False),
        ("s3://my-bucket/my-folder/year=2022/month=12/day=10/", True),
        ("s3://my-bucket/my-folder/year=2022/month=12/day=10/_temporary/", False),
        ("s3://my-bucket/my-folder/year=2022/month=10/day=10/", False),
    ]

    for path, should_allow in expectations:
        if should_allow:
            assert spec.dir_allowed(path) is True, f"{path} should be allowed"
        else:
            assert spec.dir_allowed(path) is False, f"{path} should be denied"