| 
									
										
										
										
											2025-04-03 10:39:47 +05:30
										 |  |  | #  Copyright 2025 Collate | 
					
						
							|  |  |  | #  Licensed under the Collate Community License, Version 1.0 (the "License"); | 
					
						
							| 
									
										
										
										
											2024-05-16 10:03:27 +02:00
										 |  |  | #  you may not use this file except in compliance with the License. | 
					
						
							|  |  |  | #  You may obtain a copy of the License at | 
					
						
							| 
									
										
										
										
											2025-04-03 10:39:47 +05:30
										 |  |  | #  https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/LICENSE | 
					
						
							| 
									
										
										
										
											2024-05-16 10:03:27 +02:00
										 |  |  | #  Unless required by applicable law or agreed to in writing, software | 
					
						
							|  |  |  | #  distributed under the License is distributed on an "AS IS" BASIS, | 
					
						
							|  |  |  | #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
					
						
							|  |  |  | #  See the License for the specific language governing permissions and | 
					
						
							|  |  |  | #  limitations under the License. | 
					
						
							|  |  |  | """S3 integration tests""" | 
					
						
							|  |  |  | import sys | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | import pytest | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | from metadata.generated.schema.entity.data.container import Container, FileFormat | 
					
						
							|  |  |  | from metadata.generated.schema.entity.services.storageService import StorageService | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  | @pytest.mark.skipif( | 
					
						
							|  |  |  |     sys.version_info < (3, 9), | 
					
						
							|  |  |  |     reason="testcontainers Network feature requires python3.9 or higher", | 
					
						
							|  |  |  | ) | 
					
						
							|  |  |  | def test_s3_ingestion(metadata, ingest_s3_storage, service_name): | 
					
						
							|  |  |  |     """Test the ingestion is working as expected""" | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     service: StorageService = metadata.get_by_name( | 
					
						
							|  |  |  |         entity=StorageService, fqn=service_name | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     assert service | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # We should have the bucket and all its structured children | 
					
						
							|  |  |  |     bucket: Container = metadata.get_by_name( | 
					
						
							|  |  |  |         entity=Container, fqn=f"{service_name}.test-bucket", fields=["*"] | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     # The bucket has children and no dataModel | 
					
						
							| 
									
										
										
										
											2024-07-31 00:05:58 +05:30
										 |  |  |     assert 7 == len(bucket.children.root) | 
					
						
							| 
									
										
										
										
											2024-05-16 10:03:27 +02:00
										 |  |  |     assert not bucket.dataModel | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # We can validate the children | 
					
						
							|  |  |  |     cities: Container = metadata.get_by_name( | 
					
						
							|  |  |  |         entity=Container, fqn=f"{service_name}.test-bucket.cities", fields=["*"] | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     assert cities.dataModel.isPartitioned | 
					
						
							|  |  |  |     assert 9 == len(cities.dataModel.columns) | 
					
						
							|  |  |  |     assert FileFormat.parquet in cities.fileFormats | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     cities_multiple: Container = metadata.get_by_name( | 
					
						
							|  |  |  |         entity=Container, | 
					
						
							|  |  |  |         fqn=f"{service_name}.test-bucket.cities_multiple", | 
					
						
							|  |  |  |         fields=["*"], | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     assert cities_multiple.dataModel.isPartitioned | 
					
						
							|  |  |  |     assert 11 == len(cities_multiple.dataModel.columns) | 
					
						
							|  |  |  |     assert FileFormat.parquet in cities_multiple.fileFormats | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     cities_multiple_simple: Container = metadata.get_by_name( | 
					
						
							|  |  |  |         entity=Container, | 
					
						
							|  |  |  |         fqn=f"{service_name}.test-bucket.cities_multiple_simple", | 
					
						
							|  |  |  |         fields=["*"], | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     assert cities_multiple_simple.dataModel.isPartitioned | 
					
						
							|  |  |  |     assert 10 == len(cities_multiple_simple.dataModel.columns) | 
					
						
							|  |  |  |     assert FileFormat.parquet in cities_multiple_simple.fileFormats | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     transactions: Container = metadata.get_by_name( | 
					
						
							|  |  |  |         entity=Container, fqn=f"{service_name}.test-bucket.transactions", fields=["*"] | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     assert not transactions.dataModel.isPartitioned | 
					
						
							|  |  |  |     assert 2 == len(transactions.dataModel.columns) | 
					
						
							|  |  |  |     assert FileFormat.csv in transactions.fileFormats | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     transactions_separator: Container = metadata.get_by_name( | 
					
						
							|  |  |  |         entity=Container, | 
					
						
							|  |  |  |         fqn=f"{service_name}.test-bucket.transactions_separator", | 
					
						
							|  |  |  |         fields=["*"], | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     assert not transactions_separator.dataModel.isPartitioned | 
					
						
							|  |  |  |     assert 2 == len(transactions_separator.dataModel.columns) | 
					
						
							|  |  |  |     assert FileFormat.csv in transactions_separator.fileFormats | 
					
						
							| 
									
										
										
										
											2024-07-08 15:24:39 +05:30
										 |  |  | 
 | 
					
						
							|  |  |  |     png_file: Container = metadata.get_by_name( | 
					
						
							|  |  |  |         entity=Container, | 
					
						
							|  |  |  |         fqn=f'{service_name}.test-bucket."solved.png"', | 
					
						
							|  |  |  |         fields=["*"], | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     assert not png_file.dataModel | 
					
						
							|  |  |  |     assert png_file.size > 1000 | 
					
						
							| 
									
										
										
										
											2024-07-31 00:05:58 +05:30
										 |  |  | 
 | 
					
						
							|  |  |  |     # validate unstructured parent containers | 
					
						
							|  |  |  |     container1: Container = metadata.get_by_name( | 
					
						
							|  |  |  |         entity=Container, | 
					
						
							|  |  |  |         fqn=f"{service_name}.test-bucket.docs_images", | 
					
						
							|  |  |  |         fields=["*"], | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     assert not container1.dataModel | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     container2: Container = metadata.get_by_name( | 
					
						
							|  |  |  |         entity=Container, | 
					
						
							|  |  |  |         fqn=f"{service_name}.test-bucket.docs_images.storage", | 
					
						
							|  |  |  |         fields=["*"], | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     assert not container2.dataModel | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     container3: Container = metadata.get_by_name( | 
					
						
							|  |  |  |         entity=Container, | 
					
						
							|  |  |  |         fqn=f"{service_name}.test-bucket.docs_images.storage.s3", | 
					
						
							|  |  |  |         fields=["*"], | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     assert not container3.dataModel | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     # validate images container | 
					
						
							|  |  |  |     image1: Container = metadata.get_by_name( | 
					
						
							|  |  |  |         entity=Container, | 
					
						
							|  |  |  |         fqn=f'{service_name}.test-bucket.docs_images.storage.s3."add-new-service.png"', | 
					
						
							|  |  |  |         fields=["*"], | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     assert not image1.dataModel | 
					
						
							|  |  |  |     assert image1.size > 100 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     image1: Container = metadata.get_by_name( | 
					
						
							|  |  |  |         entity=Container, | 
					
						
							|  |  |  |         fqn=f'{service_name}.test-bucket.docs_images.storage."s3-demo.png"', | 
					
						
							|  |  |  |         fields=["*"], | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     assert not image1.dataModel | 
					
						
							|  |  |  |     assert image1.size > 100 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     image2: Container = metadata.get_by_name( | 
					
						
							|  |  |  |         entity=Container, | 
					
						
							|  |  |  |         fqn=f'{service_name}.test-bucket.docs_images.synapse."add-new-service.webp"', | 
					
						
							|  |  |  |         fields=["*"], | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     assert not image2.dataModel | 
					
						
							|  |  |  |     assert image2.size > 100 | 
					
						
							|  |  |  | 
 | 
					
						
							|  |  |  |     image3: Container = metadata.get_by_name( | 
					
						
							|  |  |  |         entity=Container, | 
					
						
							|  |  |  |         fqn=f'{service_name}.test-bucket.docs_images.domodatabase."scopes.jpeg"', | 
					
						
							|  |  |  |         fields=["*"], | 
					
						
							|  |  |  |     ) | 
					
						
							|  |  |  |     assert image3 is None |