2024-05-16 10:03:27 +02:00
|
|
|
# Copyright 2021 Collate
|
|
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
# you may not use this file except in compliance with the License.
|
|
|
|
# You may obtain a copy of the License at
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
# See the License for the specific language governing permissions and
|
|
|
|
# limitations under the License.
|
|
|
|
"""S3 integration tests"""
|
|
|
|
import sys
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
|
|
|
from metadata.generated.schema.entity.data.container import Container, FileFormat
|
|
|
|
from metadata.generated.schema.entity.services.storageService import StorageService
|
|
|
|
|
|
|
|
|
|
|
|
def _get_container(metadata, fqn: str) -> Container:
    """Fetch a Container by fully qualified name and assert that it exists.

    Args:
        metadata: client exposing ``get_by_name(entity=..., fqn=..., fields=...)``.
        fqn: fully qualified name of the container to look up.

    Returns:
        The Container returned by ``get_by_name``, requested with ``fields=["*"]``.

    Raises:
        AssertionError: if the lookup returns ``None``. The message carries the
            FQN so that a missing entity is reported by name instead of
            surfacing later as an ``AttributeError`` on ``NoneType``.
    """
    container = metadata.get_by_name(entity=Container, fqn=fqn, fields=["*"])
    assert container is not None, f"Container [{fqn}] not found after ingestion"
    return container


@pytest.mark.skipif(
    sys.version_info < (3, 9),
    reason="testcontainers Network feature requires python3.9 or higher",
)
def test_s3_ingestion(metadata, ingest_s3_storage, service_name):
    """Test the ingestion is working as expected.

    Checks, in order: the storage service, the ``test-bucket`` container, its
    structured children (parquet and csv), a top-level image file, the
    unstructured ``docs_images`` folder hierarchy and the image files below it,
    and finally one image that is expected to be absent.

    Args:
        metadata: client used to look entities up by fully qualified name.
        ingest_s3_storage: not referenced in the body; presumably requested so
            the ingestion fixture runs before the assertions - confirm in conftest.
        service_name: name of the storage service, used as the FQN prefix.
    """
    service: StorageService = metadata.get_by_name(
        entity=StorageService, fqn=service_name
    )
    assert service

    bucket_fqn = f"{service_name}.test-bucket"

    # We should have the bucket and all its structured children
    bucket = _get_container(metadata, bucket_fqn)
    # The bucket has children and no dataModel
    assert 7 == len(bucket.children.root)
    assert not bucket.dataModel

    # We can validate the children: partitioned parquet containers
    for name, expected_columns in (
        ("cities", 9),
        ("cities_multiple", 11),
        ("cities_multiple_simple", 10),
    ):
        container = _get_container(metadata, f"{bucket_fqn}.{name}")
        assert container.dataModel.isPartitioned, name
        assert expected_columns == len(container.dataModel.columns), name
        assert FileFormat.parquet in container.fileFormats, name

    # Non-partitioned csv containers
    for name in ("transactions", "transactions_separator"):
        container = _get_container(metadata, f"{bucket_fqn}.{name}")
        assert not container.dataModel.isPartitioned, name
        assert 2 == len(container.dataModel.columns), name
        assert FileFormat.csv in container.fileFormats, name

    # Image stored directly in the bucket: no dataModel, only a size
    png_file = _get_container(metadata, f'{bucket_fqn}."solved.png"')
    assert not png_file.dataModel
    assert png_file.size > 1000

    # validate unstructured parent containers
    docs_fqn = f"{bucket_fqn}.docs_images"
    for folder_fqn in (
        docs_fqn,
        f"{docs_fqn}.storage",
        f"{docs_fqn}.storage.s3",
    ):
        folder = _get_container(metadata, folder_fqn)
        assert not folder.dataModel, folder_fqn

    # validate images container
    for image_fqn in (
        f'{docs_fqn}.storage.s3."add-new-service.png"',
        f'{docs_fqn}.storage."s3-demo.png"',
        f'{docs_fqn}.synapse."add-new-service.webp"',
    ):
        image = _get_container(metadata, image_fqn)
        assert not image.dataModel, image_fqn
        assert image.size > 100, image_fqn

    # This image must NOT exist, so it is looked up directly rather than through
    # the asserting helper.
    # NOTE(review): presumably excluded by the ingestion configuration - confirm
    # against the fixture setup.
    missing_image = metadata.get_by_name(
        entity=Container,
        fqn=f'{docs_fqn}.domodatabase."scopes.jpeg"',
        fields=["*"],
    )
    assert missing_image is None
|