fix(ingestion/dremio): Ignore filtered containers in schema allow/deny pattern (#11959)

Co-authored-by: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com>

parent: 7c1d3b09ed
commit: 06edf23a33
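The change applies the schema allow/deny patterns while containers are being discovered, instead of looking sources up one by one afterwards. As a rough, standalone sketch of the matching semantics introduced in `_check_pattern_match` and `should_include_container` below (the names `_to_regex` and `should_include` are illustrative helpers, not DataHub APIs): deny patterns are checked first and must match the full dotted path, while allow patterns may match on a prefix so that parent containers are still traversed.

import re
from typing import List


def _to_regex(pattern: str, allow_prefix: bool) -> str:
    # Escape literal dots, then restore ".*" as a real wildcard
    # (mirrors the conversion done in _check_pattern_match below).
    body = pattern if pattern.startswith("^") else "^" + pattern
    body = body.replace(".", r"\.").replace(r"\.*", ".*")
    if not pattern.endswith(".*") and not pattern.endswith("$") and not allow_prefix:
        body += "$"  # require an exact match when prefix matching is off
    return body


def should_include(path: List[str], name: str, allow: List[str], deny: List[str]) -> bool:
    full_path = ".".join(path + [name])
    # Deny patterns win and must match the full path.
    if any(re.match(_to_regex(p, allow_prefix=False), full_path, re.IGNORECASE) for p in deny):
        return False
    # Allow patterns may match a prefix, so parents of allowed containers stay reachable.
    return any(re.match(_to_regex(p, allow_prefix=True), full_path, re.IGNORECASE) for p in allow)


assert should_include([], "prod", allow=["prod.*"], deny=["prod.internal.*"])
assert not should_include(["prod"], "internal", allow=["prod.*"], deny=["prod.internal.*"])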
@@ -1,6 +1,7 @@
 import concurrent.futures
 import json
 import logging
+import re
 import warnings
 from collections import defaultdict
 from enum import Enum
@@ -609,32 +610,6 @@ class DremioAPIOperations:
 
         return self.execute_query(query=jobs_query)
 
-    def get_source_by_id(self, source_id: str) -> Optional[Dict]:
-        """
-        Fetch source details by ID.
-        """
-        response = self.get(
-            url=f"/source/{source_id}",
-        )
-        return response if response else None
-
-    def get_source_for_dataset(self, schema: str, dataset: str) -> Optional[Dict]:
-        """
-        Get source information for a dataset given its schema and name.
-        """
-        dataset_id = self.get_dataset_id(schema, dataset)
-        if not dataset_id:
-            return None
-
-        catalog_entry = self.get(
-            url=f"/catalog/{dataset_id}",
-        )
-        if not catalog_entry or "path" not in catalog_entry:
-            return None
-
-        source_id = catalog_entry["path"][0]
-        return self.get_source_by_id(source_id)
-
     def get_tags_for_resource(self, resource_id: str) -> Optional[List[str]]:
         """
         Get Dremio tags for a given resource_id.
@@ -673,55 +648,119 @@ class DremioAPIOperations:
             )
         return None
 
-    def get_containers_for_location(
-        self, resource_id: str, path: List[str]
-    ) -> List[Dict[str, str]]:
-        containers = []
+    def _check_pattern_match(
+        self,
+        pattern: str,
+        paths: List[str],
+        allow_prefix: bool = True,
+    ) -> bool:
+        """
+        Helper method to check if a pattern matches any of the paths.
+        Handles hierarchical matching where each level is matched independently.
+        Also handles prefix matching for partial paths.
+        """
+        if pattern == ".*":
+            return True
 
-        def traverse_path(location_id: str, entity_path: List[str]) -> List:
-            nonlocal containers
-            try:
-                response = self.get(url=f"/catalog/{location_id}")
-                if (
-                    response.get("entityType")
-                    == DremioEntityContainerType.FOLDER.value.lower()
+        # Convert the pattern to regex with proper anchoring
+        regex_pattern = pattern
+        if pattern.startswith("^"):
+            # Already has start anchor
+            regex_pattern = pattern.replace(".", r"\.")  # Escape dots
+            regex_pattern = regex_pattern.replace(
+                r"\.*", ".*"
+            )  # Convert .* to wildcard
+        else:
+            # Add start anchor and handle dots
+            regex_pattern = "^" + pattern.replace(".", r"\.").replace(r"\.*", ".*")
+
+        # Handle end matching
+        if not pattern.endswith(".*"):
+            if pattern.endswith("$"):
+                # Keep explicit end anchor
+                pass
+            elif not allow_prefix:
+                # Add end anchor for exact matching
+                regex_pattern = regex_pattern + "$"
+
+        for path in paths:
+            if re.match(regex_pattern, path, re.IGNORECASE):
+                return True
+
+        return False
+
+    def should_include_container(self, path: List[str], name: str) -> bool:
+        """
+        Helper method to check if a container should be included based on schema patterns.
+        Used by both get_all_containers and get_containers_for_location.
+        """
+        path_components = path + [name] if path else [name]
+        full_path = ".".join(path_components)
+
+        # Default allow everything case
+        if self.allow_schema_pattern == [".*"] and not self.deny_schema_pattern:
+            self.report.report_container_scanned(full_path)
+            return True
+
+        # Check deny patterns first
+        if self.deny_schema_pattern:
+            for pattern in self.deny_schema_pattern:
+                if self._check_pattern_match(
+                    pattern=pattern,
+                    paths=[full_path],
+                    allow_prefix=False,
                 ):
-                    containers.append(
-                        {
-                            "id": location_id,
-                            "name": entity_path[-1],
-                            "path": entity_path[:-1],
-                            "container_type": DremioEntityContainerType.FOLDER,
-                        }
-                    )
+                    self.report.report_container_filtered(full_path)
+                    return False
 
-                for container in response.get("children", []):
-                    if (
-                        container.get("type")
-                        == DremioEntityContainerType.CONTAINER.value
+        # Check allow patterns
+        for pattern in self.allow_schema_pattern:
+            # For patterns with wildcards, check if this path is a parent of the pattern
+            if "*" in pattern:
+                pattern_parts = pattern.split(".")
+                path_parts = path_components
+
+                # If pattern has exact same number of parts, check each component
+                if len(pattern_parts) == len(path_parts):
+                    matches = True
+                    for p_part, c_part in zip(pattern_parts, path_parts):
+                        if p_part != "*" and p_part.lower() != c_part.lower():
+                            matches = False
+                            break
+                    if matches:
+                        self.report.report_container_scanned(full_path)
+                        return True
+                # Otherwise check if current path is prefix match
+                else:
+                    # Remove the trailing wildcard if present
+                    if pattern_parts[-1] == "*":
+                        pattern_parts = pattern_parts[:-1]
+
+                    for i in range(len(path_parts)):
+                        current_path = ".".join(path_parts[: i + 1])
+                        pattern_prefix = ".".join(pattern_parts[: i + 1])
+
+                        if pattern_prefix.startswith(current_path):
+                            self.report.report_container_scanned(full_path)
+                            return True
+
+            # Direct pattern matching
+            if self._check_pattern_match(
+                pattern=pattern,
+                paths=[full_path],
+                allow_prefix=True,
             ):
-                        traverse_path(container.get("id"), container.get("path"))
+                self.report.report_container_scanned(full_path)
+                return True
 
-            except Exception as exc:
-                logging.info(
-                    "Location {} contains no tables or views. Skipping...".format(id)
-                )
-                self.report.warning(
-                    message="Failed to get tables or views",
-                    context=f"{id}",
-                    exc=exc,
-                )
-
-            return containers
-
-        return traverse_path(location_id=resource_id, entity_path=path)
+        self.report.report_container_filtered(full_path)
+        return False
 
     def get_all_containers(self):
         """
-        Query the Dremio sources API and return source information.
+        Query the Dremio sources API and return filtered source information.
         """
         containers = []
-
         response = self.get(url="/catalog")
 
         def process_source(source):
@@ -731,11 +770,11 @@ class DremioAPIOperations:
                 )
 
                 source_config = source_resp.get("config", {})
-                if source_config.get("database"):
-                    db = source_config.get("database")
-                else:
-                    db = source_config.get("databaseName", "")
+                db = source_config.get(
+                    "database", source_config.get("databaseName", "")
+                )
 
+                if self.should_include_container([], source.get("path")[0]):
                     return {
                         "id": source.get("id"),
                         "name": source.get("path")[0],
@@ -746,19 +785,26 @@ class DremioAPIOperations:
                         "database_name": db,
                     }
             else:
+                if self.should_include_container([], source.get("path")[0]):
                     return {
                         "id": source.get("id"),
                         "name": source.get("path")[0],
                         "path": [],
                         "container_type": DremioEntityContainerType.SPACE,
                     }
+            return None
 
         def process_source_and_containers(source):
             container = process_source(source)
+            if not container:
+                return []
+
+            # Get sub-containers
             sub_containers = self.get_containers_for_location(
                 resource_id=container.get("id"),
                 path=[container.get("name")],
            )
+
             return [container] + sub_containers
 
         # Use ThreadPoolExecutor to parallelize the processing of sources
@@ -771,7 +817,16 @@ class DremioAPIOperations:
             }
 
             for future in concurrent.futures.as_completed(future_to_source):
+                source = future_to_source[future]
+                try:
                     containers.extend(future.result())
+                except Exception as exc:
+                    logger.error(f"Error processing source: {exc}")
+                    self.report.warning(
+                        message="Failed to process source",
+                        context=f"{source}",
+                        exc=exc,
+                    )
 
         return containers
 
@@ -785,3 +840,55 @@ class DremioAPIOperations:
             )
         else:
             return ""
+
+    def get_containers_for_location(
+        self, resource_id: str, path: List[str]
+    ) -> List[Dict[str, str]]:
+        containers = []
+
+        def traverse_path(location_id: str, entity_path: List[str]) -> List:
+            nonlocal containers
+            try:
+                response = self.get(url=f"/catalog/{location_id}")
+
+                # Check if current folder should be included
+                if (
+                    response.get("entityType")
+                    == DremioEntityContainerType.FOLDER.value.lower()
+                ):
+                    folder_name = entity_path[-1]
+                    folder_path = entity_path[:-1]
+
+                    if self.should_include_container(folder_path, folder_name):
+                        containers.append(
+                            {
+                                "id": location_id,
+                                "name": folder_name,
+                                "path": folder_path,
+                                "container_type": DremioEntityContainerType.FOLDER,
+                            }
+                        )
+
+                # Recursively process child containers
+                for container in response.get("children", []):
+                    if (
+                        container.get("type")
+                        == DremioEntityContainerType.CONTAINER.value
+                    ):
+                        traverse_path(container.get("id"), container.get("path"))
+
+            except Exception as exc:
+                logging.info(
+                    "Location {} contains no tables or views. Skipping...".format(
+                        location_id
+                    )
+                )
+                self.report.warning(
+                    message="Failed to get tables or views",
+                    context=f"{location_id}",
+                    exc=exc,
+                )
+
+            return containers
+
+        return traverse_path(location_id=resource_id, entity_path=path)
@@ -31,6 +31,7 @@ class DremioToDataHubSourceTypeMapping:
         "SNOWFLAKE": "snowflake",
         "SYNAPSE": "mssql",
         "TERADATA": "teradata",
+        "VERTICA": "vertica",
     }
 
     DATABASE_SOURCE_TYPES = {
@@ -52,6 +53,7 @@ class DremioToDataHubSourceTypeMapping:
         "SNOWFLAKE",
         "SYNAPSE",
         "TERADATA",
+        "VERTICA",
     }
 
     FILE_OBJECT_STORAGE_TYPES = {
@@ -14,12 +14,27 @@ class DremioSourceReport(
 ):
     num_containers_failed: int = 0
     num_datasets_failed: int = 0
+    containers_scanned: int = 0
+    containers_filtered: int = 0
 
     def report_upstream_latency(self, start_time: datetime, end_time: datetime) -> None:
         # recording total combined latency is not very useful, keeping this method as a placeholder
         # for future implementation of min / max / percentiles etc.
         pass
 
+    def report_container_scanned(self, name: str) -> None:
+        """
+        Record that a container was successfully scanned
+        """
+        self.containers_scanned += 1
+
+    def report_container_filtered(self, container_name: str) -> None:
+        """
+        Record that a container was filtered out
+        """
+        self.containers_filtered += 1
+        self.report_dropped(container_name)
+
     def report_entity_scanned(self, name: str, ent_type: str = "View") -> None:
         """
         Entity could be a view or a table
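The new counters follow the report's existing conventions: each container decision increments exactly one counter, and filtered names are additionally routed through the standard dropped-entity report. A minimal stand-in sketch of that behavior (a simplified hypothetical class, not the real `DremioSourceReport` mixin stack):

from dataclasses import dataclass, field
from typing import List


@dataclass
class ContainerFilterReport:
    # Simplified stand-in for the counters added to DremioSourceReport.
    containers_scanned: int = 0
    containers_filtered: int = 0
    dropped: List[str] = field(default_factory=list)

    def report_container_scanned(self, name: str) -> None:
        self.containers_scanned += 1

    def report_container_filtered(self, container_name: str) -> None:
        self.containers_filtered += 1
        # The real class delegates to report_dropped(container_name).
        self.dropped.append(container_name)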
										
											
(File diff suppressed because it is too large)

(File diff suppressed because it is too large)
@@ -0,0 +1,26 @@
+source:
+  type: dremio
+  config:
+    # Coordinates
+    hostname: localhost
+    port: 9047
+    tls: false
+
+    # Credentials
+    authentication_method: password
+    username: admin
+    password: "2310Admin1234!@"
+
+    platform_instance: test-platform
+
+    include_query_lineage: false
+
+    source_mappings:
+      - platform: s3
+        source_name: samples
+        platform_instance: s3_test_samples
+
+sink:
+  type: file
+  config:
+    filename: "./dremio_mces.json"
										
											
(File diff suppressed because it is too large)
@@ -0,0 +1,28 @@
+source:
+  type: dremio
+  config:
+    # Coordinates
+    hostname: localhost
+    port: 9047
+    tls: false
+
+    # Credentials
+    authentication_method: password
+    username: admin
+    password: "2310Admin1234!@"
+
+    include_query_lineage: false
+
+    source_mappings:
+      - platform: s3
+        source_name: samples
+        platform_instance: s3_test_samples
+
+    schema_pattern:
+      allow:
+        - "Samples"
+
+sink:
+  type: file
+  config:
+    filename: "./dremio_mces.json"
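Given the prefix-matching semantics sketched near the top of this page, an allow pattern of `Samples` should admit the Samples source itself and everything beneath it while dropping sibling spaces. For example, with the illustrative `should_include` helper from that sketch:

# Hypothetical checks against the should_include() sketch shown earlier.
assert should_include([], "Samples", allow=["Samples"], deny=[])
assert should_include(["Samples"], "samples.dremio.com", allow=["Samples"], deny=[])
assert not should_include([], "space", allow=["Samples"], deny=[])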
@@ -1,6 +1,7 @@
 import json
 import os
 import subprocess
+from typing import Dict
 
 import boto3
 import pytest
@@ -75,9 +76,10 @@ def create_spaces_and_folders(headers):
 
 
 def create_sample_source(headers):
-    url = f"{DREMIO_HOST}/apiv2/source/Samples"
+    url = f"{DREMIO_HOST}/api/v3/catalog"
 
     payload = {
+        "entityType": "source",
         "config": {
             "externalBucketList": ["samples.dremio.com"],
             "credentialType": "NONE",
@@ -95,14 +97,15 @@ def create_sample_source(headers):
         "type": "S3",
     }
 
-    response = requests.put(url, headers=headers, data=json.dumps(payload))
+    response = requests.post(url, headers=headers, data=json.dumps(payload))
     assert response.status_code == 200, f"Failed to add dataset: {response.text}"
 
 
 def create_s3_source(headers):
-    url = f"{DREMIO_HOST}/apiv2/source/s3"
+    url = f"{DREMIO_HOST}/api/v3/catalog"
 
     payload = {
+        "entityType": "source",
         "name": "s3",
         "config": {
             "credentialType": "ACCESS_KEY",
@@ -139,24 +142,25 @@ def create_s3_source(headers):
         "metadataPolicy": {
             "deleteUnavailableDatasets": True,
             "autoPromoteDatasets": False,
-            "namesRefreshMillis": 3600000,
-            "datasetDefinitionRefreshAfterMillis": 3600000,
-            "datasetDefinitionExpireAfterMillis": 10800000,
-            "authTTLMillis": 86400000,
-            "updateMode": "PREFETCH_QUERIED",
+            "namesRefreshMs": 3600000,
+            "datasetRefreshAfterMs": 3600000,
+            "datasetExpireAfterMs": 10800000,
+            "authTTLMs": 86400000,
+            "datasetUpdateMode": "PREFETCH_QUERIED",
         },
         "type": "S3",
         "accessControlList": {"userControls": [], "roleControls": []},
     }
 
-    response = requests.put(url, headers=headers, data=json.dumps(payload))
+    response = requests.post(url, headers=headers, data=json.dumps(payload))
     assert response.status_code == 200, f"Failed to add s3 datasource: {response.text}"
 
 
 def create_mysql_source(headers):
-    url = f"{DREMIO_HOST}/apiv2/source/mysql"
+    url = f"{DREMIO_HOST}/api/v3/catalog"
 
     payload = {
+        "entityType": "source",
         "config": {
             "username": "root",
             "password": "rootpwd123",
@@ -169,7 +173,7 @@ def create_mysql_source(headers):
             "maxIdleConns": 8,
             "idleTimeSec": 60,
         },
-        "name": "mysql-source",
+        "name": "mysql",
         "accelerationRefreshPeriod": 3600000,
         "accelerationGracePeriod": 10800000,
         "accelerationActivePolicyType": "PERIOD",
@@ -177,30 +181,46 @@ def create_mysql_source(headers):
         "accelerationRefreshOnDataChanges": False,
         "metadataPolicy": {
             "deleteUnavailableDatasets": True,
-            "namesRefreshMillis": 3600000,
-            "datasetDefinitionRefreshAfterMillis": 3600000,
-            "datasetDefinitionExpireAfterMillis": 10800000,
-            "authTTLMillis": 86400000,
-            "updateMode": "PREFETCH_QUERIED",
+            "namesRefreshMs": 3600000,
+            "datasetRefreshAfterMs": 3600000,
+            "datasetExpireAfterMs": 10800000,
+            "authTTLMs": 86400000,
+            "datasetUpdateMode": "PREFETCH_QUERIED",
         },
         "type": "MYSQL",
     }
-    response = requests.put(url, headers=headers, data=json.dumps(payload))
+    response = requests.post(url, headers=headers, data=json.dumps(payload))
     assert (
         response.status_code == 200
     ), f"Failed to add mysql datasource: {response.text}"
 
 
 def upload_dataset(headers):
-    url = f"{DREMIO_HOST}/apiv2/source/s3/file_format/warehouse/sample.parquet"
-    payload = {"ignoreOtherFileFormats": False, "type": "Parquet"}
+    url = f"{DREMIO_HOST}/api/v3/catalog/dremio%3A%2Fs3%2Fwarehouse"
+    payload = {
+        "entityType": "dataset",
+        "type": "PHYSICAL_DATASET",
+        "path": [
+            "s3",
+            "warehouse",
+        ],
+        "format": {"type": "Parquet"},
+    }
 
-    response = requests.put(url, headers=headers, data=json.dumps(payload))
+    response = requests.post(url, headers=headers, data=json.dumps(payload))
     assert response.status_code == 200, f"Failed to add dataset: {response.text}"
 
-    url = f"{DREMIO_HOST}/apiv2/source/Samples/file_format/samples.dremio.com/NYC-weather.csv"
+    url = f"{DREMIO_HOST}/api/v3/catalog/dremio%3A%2FSamples%2Fsamples.dremio.com%2FNYC-weather.csv"
 
     payload = {
+        "entityType": "dataset",
+        "type": "PHYSICAL_DATASET",
+        "path": [
+            "Samples",
+            "samples.dremio.com",
+            "NYC-weather.csv",
+        ],
+        "format": {
             "fieldDelimiter": ",",
             "quote": '"',
             "comment": "#",
@@ -210,21 +230,41 @@ def upload_dataset(headers):
             "trimHeader": True,
             "skipFirstLine": False,
             "type": "Text",
+        },
     }
 
-    response = requests.put(url, headers=headers, data=json.dumps(payload))
+    response = requests.post(url, headers=headers, data=json.dumps(payload))
     assert response.status_code == 200, f"Failed to add dataset: {response.text}"
 
-    url = f"{DREMIO_HOST}/apiv2/source/Samples/file_format/samples.dremio.com/Dremio%20University/oracle-departments.xlsx"
-
-    payload = {"extractHeader": True, "hasMergedCells": False, "type": "Excel"}
-
-    response = requests.put(url, headers=headers, data=json.dumps(payload))
-    assert response.status_code == 200, f"Failed to add dataset: {response.text}"
-
-    url = f"{DREMIO_HOST}/apiv2/source/Samples/file_format/samples.dremio.com/Dremio%20University/googleplaystore.csv"
+    url = f"{DREMIO_HOST}/api/v3/catalog/dremio%3A%2FSamples%2Fsamples.dremio.com%2FDremio%20University%2Foracle-departments.xlsx"
 
     payload = {
+        "entityType": "dataset",
+        "type": "PHYSICAL_DATASET",
+        "path": [
+            "Samples",
+            "samples.dremio.com",
+            "Dremio University",
+            "oracle-departments.xlsx",
+        ],
+        "format": {"extractHeader": True, "hasMergedCells": False, "type": "Excel"},
+    }
+
+    response = requests.post(url, headers=headers, data=json.dumps(payload))
+    assert response.status_code == 200, f"Failed to add dataset: {response.text}"
+
+    url = f"{DREMIO_HOST}/api/v3/catalog/dremio%3A%2FSamples%2Fsamples.dremio.com%2FDremio%20University%2Fgoogleplaystore.csv"
+
+    payload = {
+        "entityType": "dataset",
+        "type": "PHYSICAL_DATASET",
+        "path": [
+            "Samples",
+            "samples.dremio.com",
+            "Dremio University",
+            "googleplaystore.csv",
+        ],
+        "format": {
             "fieldDelimiter": ",",
             "quote": '"',
             "comment": "#",
@@ -234,15 +274,28 @@ def upload_dataset(headers):
             "trimHeader": True,
             "skipFirstLine": False,
             "type": "Text",
+        },
     }
 
-    response = requests.put(url, headers=headers, data=json.dumps(payload))
+    response = requests.post(url, headers=headers, data=json.dumps(payload))
     assert response.status_code == 200, f"Failed to add dataset: {response.text}"
 
-    url = f"{DREMIO_HOST}/apiv2/source/Samples/file_format/samples.dremio.com/tpcds_sf1000/catalog_page/1ab266d5-18eb-4780-711d-0fa337fa6c00/0_0_0.parquet"
-    payload = {"ignoreOtherFileFormats": False, "type": "Parquet"}
+    url = f"{DREMIO_HOST}/api/v3/catalog/dremio%3A%2FSamples%2Fsamples.dremio.com%2Ftpcds_sf1000%2Fcatalog_page%2F1ab266d5-18eb-4780-711d-0fa337fa6c00%2F0_0_0.parquet"
+    payload = {
+        "entityType": "dataset",
+        "type": "PHYSICAL_DATASET",
+        "path": [
+            "Samples",
+            "samples.dremio.com",
+            "tpcds_sf1000",
+            "catalog_page",
+            "1ab266d5-18eb-4780-711d-0fa337fa6c00",
+            "0_0_0.parquet",
+        ],
+        "format": {"type": "Parquet"},
+    }
 
-    response = requests.put(url, headers=headers, data=json.dumps(payload))
+    response = requests.post(url, headers=headers, data=json.dumps(payload))
     assert response.status_code == 200, f"Failed to add dataset: {response.text}"
 
 
@@ -253,7 +306,7 @@ def create_view(headers):
         "entityType": "dataset",
         "type": "VIRTUAL_DATASET",
         "path": ["space", "test_folder", "raw"],
-        "sql": 'SELECT * FROM s3.warehouse."sample.parquet"',
+        "sql": "SELECT * FROM s3.warehouse",
     }
     response = requests.post(url, headers=headers, data=json.dumps(payload))
     assert response.status_code == 200, f"Failed to create view: {response.text}"
@@ -273,7 +326,7 @@ def create_view(headers):
         "entityType": "dataset",
         "type": "VIRTUAL_DATASET",
         "path": ["space", "test_folder", "customers"],
-        "sql": 'SELECT * FROM "mysql".northwind.customers',
+        "sql": "SELECT * FROM mysql.northwind.customers",
         "sqlContext": ["mysql", "northwind"],
     }
     response = requests.post(url, headers=headers, data=json.dumps(payload))
@@ -283,7 +336,7 @@ def create_view(headers):
         "entityType": "dataset",
         "type": "VIRTUAL_DATASET",
         "path": ["space", "test_folder", "orders"],
-        "sql": 'SELECT * FROM "mysql".northwind.orders',
+        "sql": "SELECT * FROM mysql.northwind.orders",
         "sqlContext": ["mysql", "northwind"],
     }
     response = requests.post(url, headers=headers, data=json.dumps(payload))
@@ -293,7 +346,7 @@ def create_view(headers):
         "entityType": "dataset",
         "type": "VIRTUAL_DATASET",
         "path": ["space", "test_folder", "metadata_aspect"],
-        "sql": 'SELECT * FROM "mysql".metagalaxy."metadata_aspect"',
+        "sql": "SELECT * FROM mysql.metagalaxy.metadata_aspect",
         "sqlContext": ["mysql", "metagalaxy"],
     }
     response = requests.post(url, headers=headers, data=json.dumps(payload))
@@ -303,7 +356,7 @@ def create_view(headers):
         "entityType": "dataset",
         "type": "VIRTUAL_DATASET",
         "path": ["space", "test_folder", "metadata_index"],
-        "sql": 'SELECT * FROM "mysql".metagalaxy."metadata_index"',
+        "sql": "SELECT * FROM mysql.metagalaxy.metadata_index",
         "sqlContext": ["mysql", "metagalaxy"],
     }
     response = requests.post(url, headers=headers, data=json.dumps(payload))
@@ -313,7 +366,7 @@ def create_view(headers):
         "entityType": "dataset",
         "type": "VIRTUAL_DATASET",
         "path": ["space", "test_folder", "metadata_index_view"],
-        "sql": 'SELECT * FROM "mysql".metagalaxy."metadata_index_view"',
+        "sql": "SELECT * FROM mysql.metagalaxy.metadata_index_view",
         "sqlContext": ["mysql", "metagalaxy"],
     }
     response = requests.post(url, headers=headers, data=json.dumps(payload))
@@ -422,14 +475,119 @@ def test_dremio_ingest(
     pytestconfig,
     tmp_path,
 ):
-    # Run the metadata ingestion pipeline.
+    # Run the metadata ingestion pipeline with specific output file
     config_file = (test_resources_dir / "dremio_to_file.yml").resolve()
+    output_path = tmp_path / "dremio_mces.json"
+
     run_datahub_cmd(["ingest", "-c", f"{config_file}"], tmp_path=tmp_path)
 
-    # Verify the output.
+    # Verify the output
     mce_helpers.check_golden_file(
         pytestconfig,
-        output_path=tmp_path / "dremio_mces.json",
+        output_path=output_path,
         golden_path=test_resources_dir / "dremio_mces_golden.json",
         ignore_paths=[],
     )
+
+
+@freeze_time(FROZEN_TIME)
+@pytest.mark.integration
+def test_dremio_platform_instance_urns(
+    test_resources_dir,
+    dremio_setup,
+    pytestconfig,
+    tmp_path,
+):
+    config_file = (
+        test_resources_dir / "dremio_platform_instance_to_file.yml"
+    ).resolve()
+    output_path = tmp_path / "dremio_mces.json"
+
+    run_datahub_cmd(["ingest", "-c", f"{config_file}"], tmp_path=tmp_path)
+
+    with output_path.open() as f:
+        content = f.read()
+        # Skip if file is empty or just contains brackets
+        if not content or content.strip() in ("[]", "[", "]"):
+            pytest.fail(f"Output file is empty or invalid: {content}")
+
+    try:
+        # Try to load as JSON Lines first
+        mces = []
+        for line in content.splitlines():
+            line = line.strip()
+            if line and line not in ("[", "]"):  # Skip empty lines and bare brackets
+                mce = json.loads(line)
+                mces.append(mce)
+    except json.JSONDecodeError:
+        # If that fails, try loading as a single JSON array
+        try:
+            mces = json.loads(content)
+        except json.JSONDecodeError as e:
+            print(f"Failed to parse file content: {content}")
+            raise e
+
+    # Verify MCEs
+    assert len(mces) > 0, "No MCEs found in output file"
+
+    # Verify the platform instances
+    for mce in mces:
+        if "entityType" not in mce:
+            continue
+
+        # Check dataset URN structure
+        if mce["entityType"] == "dataset" and "entityUrn" in mce:
+            assert (
+                "test-platform.dremio" in mce["entityUrn"]
+            ), f"Platform instance missing in dataset URN: {mce['entityUrn']}"
+
+        # Check aspects for both datasets and containers
+        if "aspectName" in mce:
+            # Check dataPlatformInstance aspect
+            if mce["aspectName"] == "dataPlatformInstance":
+                aspect = mce["aspect"]
+                if not isinstance(aspect, Dict) or "json" not in aspect:
+                    continue
+
+                aspect_json = aspect["json"]
+                if not isinstance(aspect_json, Dict):
+                    continue
+
+                if "instance" not in aspect_json:
+                    continue
+
+                instance = aspect_json["instance"]
+                expected_instance = "urn:li:dataPlatformInstance:(urn:li:dataPlatform:dremio,test-platform)"
+                assert (
+                    instance == expected_instance
+                ), f"Invalid platform instance format: {instance}"
+
+    # Verify against golden file
+    mce_helpers.check_golden_file(
+        pytestconfig,
+        output_path=output_path,
+        golden_path=test_resources_dir / "dremio_platform_instance_mces_golden.json",
+        ignore_paths=[],
+    )
+
+
+@freeze_time(FROZEN_TIME)
+@pytest.mark.integration
+def test_dremio_schema_filter(
+    test_resources_dir,
+    dremio_setup,
+    pytestconfig,
+    tmp_path,
+):
+    config_file = (test_resources_dir / "dremio_schema_filter_to_file.yml").resolve()
+    output_path = tmp_path / "dremio_mces.json"
+
+    run_datahub_cmd(["ingest", "-c", f"{config_file}"], tmp_path=tmp_path)
+
+    # Verify against golden file
+    mce_helpers.check_golden_file(
+        pytestconfig,
+        output_path=output_path,
+        golden_path=test_resources_dir / "dremio_schema_filter_mces_golden.json",
+        ignore_paths=[],
+    )
@@ -0,0 +1,123 @@
+from unittest.mock import Mock
+
+import pytest
+
+from datahub.ingestion.source.dremio.dremio_api import DremioAPIOperations
+from datahub.ingestion.source.dremio.dremio_config import DremioSourceConfig
+from datahub.ingestion.source.dremio.dremio_reporting import DremioSourceReport
+
+
+class TestDremioContainerFiltering:
+    @pytest.fixture
+    def dremio_api(self, monkeypatch):
+        # Mock the requests.Session
+        mock_session = Mock()
+        monkeypatch.setattr("requests.Session", Mock(return_value=mock_session))
+
+        # Mock the authentication response
+        mock_session.post.return_value.json.return_value = {"token": "dummy-token"}
+        mock_session.post.return_value.status_code = 200
+
+        config = DremioSourceConfig(
+            hostname="dummy-host",
+            port=9047,
+            tls=False,
+            authentication_method="password",
+            username="dummy-user",
+            password="dummy-password",
+            schema_pattern=dict(allow=[".*"], deny=[]),
+        )
+        report = DremioSourceReport()
+        return DremioAPIOperations(config, report)
+
+    def test_basic_allow_pattern(self, dremio_api):
+        """Test basic allow pattern matching"""
+        dremio_api.allow_schema_pattern = ["test"]
+        dremio_api.deny_schema_pattern = []
+
+        assert dremio_api.should_include_container([], "test")
+        assert dremio_api.should_include_container(["test"], "subfolder")
+        assert not dremio_api.should_include_container([], "prod_space")
+
+    def test_basic_deny_pattern(self, dremio_api):
+        """Test basic deny pattern matching"""
+        dremio_api.allow_schema_pattern = [".*"]
+        dremio_api.deny_schema_pattern = ["test_space.*"]
+
+        assert not dremio_api.should_include_container([], "test_space")
+        assert not dremio_api.should_include_container(["test_space"], "subfolder")
+        assert dremio_api.should_include_container([], "prod_space")
+
+    def test_hierarchical_matching(self, dremio_api):
+        """Test matching with hierarchical paths"""
+        dremio_api.allow_schema_pattern = ["prod.data.*"]
+        dremio_api.deny_schema_pattern = []
+
+        assert dremio_api.should_include_container([], "prod")
+        assert dremio_api.should_include_container(["prod"], "data")
+        assert dremio_api.should_include_container(["prod", "data"], "sales")
+        assert not dremio_api.should_include_container([], "dev")
+        assert not dremio_api.should_include_container(["dev"], "data")
+
+    def test_allow_and_deny_patterns(self, dremio_api):
+        """Test combination of allow and deny patterns"""
+        dremio_api.allow_schema_pattern = ["prod.*"]
+        dremio_api.deny_schema_pattern = ["prod.internal.*"]
+
+        assert dremio_api.should_include_container([], "prod")
+        assert dremio_api.should_include_container(["prod"], "public")
+        assert dremio_api.should_include_container(["prod", "public"], "next")
+        assert not dremio_api.should_include_container(["prod"], "internal")
+        assert not dremio_api.should_include_container(["prod", "internal"], "secrets")
+
+    def test_wildcard_patterns(self, dremio_api):
+        """Test wildcard pattern handling"""
+        dremio_api.allow_schema_pattern = [".*"]
+        dremio_api.deny_schema_pattern = []
+
+        assert dremio_api.should_include_container([], "any_space")
+        assert dremio_api.should_include_container(["any_space"], "any_folder")
+
+        # Test with specific wildcard in middle
+        dremio_api.allow_schema_pattern = ["prod.*.public"]
+        assert dremio_api.should_include_container(["prod", "customer"], "public")
+        assert not dremio_api.should_include_container(["prod", "customer"], "private")
+
+    def test_case_insensitive_matching(self, dremio_api):
+        """Test case-insensitive pattern matching"""
+        dremio_api.allow_schema_pattern = ["PROD.*"]
+        dremio_api.deny_schema_pattern = []
+
+        assert dremio_api.should_include_container([], "prod")
+        assert dremio_api.should_include_container([], "PROD")
+        assert dremio_api.should_include_container(["prod"], "DATA")
+        assert dremio_api.should_include_container(["PROD"], "data")
+
+    def test_empty_patterns(self, dremio_api):
+        """Test behavior with empty patterns"""
+        dremio_api.allow_schema_pattern = [".*"]
+        dremio_api.deny_schema_pattern = []
+
+        # Should allow everything when allow pattern is empty
+        assert dremio_api.should_include_container([], "any_space")
+        assert dremio_api.should_include_container(["any_space"], "any_folder")
+
+    def test_partial_path_matching(self, dremio_api):
+        """Test matching behavior with partial paths"""
+        dremio_api.allow_schema_pattern = ["^pr.*.data.*"]
+        dremio_api.deny_schema_pattern = []
+
+        assert dremio_api.should_include_container(["prod"], "data")
+        # Should match the partial path even though pattern doesn't have wildcards
+        assert dremio_api.should_include_container(["prod", "data"], "sales")
+        assert not dremio_api.should_include_container([], "dev")
+        assert not dremio_api.should_include_container(["dev", "data"], "sales")
+
+    def test_partial_start_end_chars(self, dremio_api):
+        """Test matching behavior with partial paths"""
+        dremio_api.allow_schema_pattern = ["pr.*.data$"]
+        dremio_api.deny_schema_pattern = []
+
+        assert dremio_api.should_include_container(["prod"], "data")
+        # Should match the partial path even though pattern doesn't have wildcards
+        assert not dremio_api.should_include_container(["prod", "data"], "sales")