mirror of
				https://github.com/open-metadata/OpenMetadata.git
				synced 2025-10-31 02:29:03 +00:00 
			
		
		
		
	 a3d6c1dd20
			
		
	
	
		a3d6c1dd20
		
			
		
	
	
	
	
		
			
			* tests(datalake): use minio 1. use minio instead of moto for mimicking s3 behavior. 2. removed moto dependency as it is not compatible with aiobotocore (https://github.com/getmoto/moto/issues/7070#issuecomment-1828484982) * - moved test_datalake_profiler_e2e.py to datalake/test_profiler - use minio instead of moto * fixed tests * fixed tests * removed default name for minio container
		
			
				
	
	
		
			90 lines
		
	
	
		
			3.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			90 lines
		
	
	
		
			3.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #  Copyright 2021 Collate
 | |
| #  Licensed under the Apache License, Version 2.0 (the "License");
 | |
| #  you may not use this file except in compliance with the License.
 | |
| #  You may obtain a copy of the License at
 | |
| #  http://www.apache.org/licenses/LICENSE-2.0
 | |
| #  Unless required by applicable law or agreed to in writing, software
 | |
| #  distributed under the License is distributed on an "AS IS" BASIS,
 | |
| #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| #  See the License for the specific language governing permissions and
 | |
| #  limitations under the License.
 | |
| 
 | |
| """Datalake ingestion integration tests"""
 | |
| 
 | |
| import pytest
 | |
| 
 | |
| from ingestion.tests.integration.datalake.conftest import BUCKET_NAME
 | |
| from metadata.generated.schema.entity.data.table import DataType, Table
 | |
| from metadata.ingestion.ometa.models import EntityList
 | |
| from metadata.ingestion.ometa.ometa_api import OpenMetadata
 | |
| 
 | |
| 
 | |
class TestDatalake:
    """Datalake ingestion and profiler end-to-end tests.

    Both tests query the ``datalake_for_integration_tests`` service through
    the ``metadata`` client that the autouse fixture binds to the instance.
    The ``run_ingestion`` / ``run_profiler`` fixtures are requested only for
    their side effects (presumably running the workflows; they are defined
    outside this module -- confirm in the package ``conftest``).
    """

    metadata: OpenMetadata = None
    s3_client = None

    @pytest.fixture(autouse=True)
    def set_metadata(self, metadata):
        """Store the ``metadata`` fixture on the instance before each test."""
        self.metadata = metadata

    @pytest.mark.order(10000)
    def test_ingestion(self, run_ingestion):
        """Check that the expected tables exist and JSON columns are expanded.

        Lists every table under ``datalake_for_integration_tests.default`` and
        asserts that exactly the five known files were ingested. Any column
        typed as ``DataType.JSON`` must expose child columns.
        """
        # List what is registered for the datalake database; the ingestion
        # itself is not triggered here (``run_ingestion`` is only requested).
        resp: EntityList[Table] = self.metadata.list_entities(
            entity=Table, params={"database": "datalake_for_integration_tests.default"}
        )  # type: ignore

        entities = resp.entities
        names = [entity.name.root for entity in entities]
        assert len(entities) == 5, f"expected 5 tables, found {len(entities)}: {names}"
        assert {
            "names.json",
            "names.jsonl",
            "new_users.parquet",
            "users.csv",
            "profiler_test_.csv",
        } == set(names)

        for entity in entities:
            for column in entity.columns:
                if column.dataType == DataType.JSON:
                    assert (
                        column.children
                    ), f"JSON column {column.name} of {entity.name.root} has no children"

    def test_profiler(self, run_profiler):
        """Check that sample data is available for the CSV, JSON and JSONL tables.

        For each file the table is fetched by fully qualified name and its
        sample data must contain at least one row.
        """
        # NOTE: "new_users.parquet" is deliberately not checked. The original
        # test excluded parquet pending a fix that is not visible in this file.
        profiled_files = ("users.csv", "names.json", "names.jsonl")

        for file_name in profiled_files:
            fqn = f'datalake_for_integration_tests.default.{BUCKET_NAME}."{file_name}"'
            table = self.metadata.get_by_name(
                entity=Table,
                fqn=fqn,
                fields=["tableProfilerConfig"],
            )
            # Fail with the FQN instead of an AttributeError further down.
            assert table is not None, f"table not found: {fqn}"

            sampled = self.metadata.get_sample_data(table)
            assert sampled is not None, f"no sample data response for {fqn}"
            assert sampled.sampleData is not None, f"sample data missing for {fqn}"
            assert sampled.sampleData.rows, f"sample data has no rows for {fqn}"
 |