fix(ingestion/dremio): Ignore filtered containers in schema allowdeny pattern (#11959)
Co-authored-by: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com>
This commit is contained in:
parent 7c1d3b09ed
commit 06edf23a33
@@ -1,6 +1,7 @@
 import concurrent.futures
 import json
 import logging
+import re
 import warnings
 from collections import defaultdict
 from enum import Enum
@@ -609,32 +610,6 @@ class DremioAPIOperations:

         return self.execute_query(query=jobs_query)

-    def get_source_by_id(self, source_id: str) -> Optional[Dict]:
-        """
-        Fetch source details by ID.
-        """
-        response = self.get(
-            url=f"/source/{source_id}",
-        )
-        return response if response else None
-
-    def get_source_for_dataset(self, schema: str, dataset: str) -> Optional[Dict]:
-        """
-        Get source information for a dataset given its schema and name.
-        """
-        dataset_id = self.get_dataset_id(schema, dataset)
-        if not dataset_id:
-            return None
-
-        catalog_entry = self.get(
-            url=f"/catalog/{dataset_id}",
-        )
-        if not catalog_entry or "path" not in catalog_entry:
-            return None
-
-        source_id = catalog_entry["path"][0]
-        return self.get_source_by_id(source_id)
-
     def get_tags_for_resource(self, resource_id: str) -> Optional[List[str]]:
         """
         Get Dremio tags for a given resource_id.
@@ -673,55 +648,119 @@ class DremioAPIOperations:
         )
         return None

-    def get_containers_for_location(
-        self, resource_id: str, path: List[str]
-    ) -> List[Dict[str, str]]:
-        containers = []
-
-        def traverse_path(location_id: str, entity_path: List[str]) -> List:
-            nonlocal containers
-            try:
-                response = self.get(url=f"/catalog/{location_id}")
-                if (
-                    response.get("entityType")
-                    == DremioEntityContainerType.FOLDER.value.lower()
-                ):
-                    containers.append(
-                        {
-                            "id": location_id,
-                            "name": entity_path[-1],
-                            "path": entity_path[:-1],
-                            "container_type": DremioEntityContainerType.FOLDER,
-                        }
-                    )
-
-                for container in response.get("children", []):
-                    if (
-                        container.get("type")
-                        == DremioEntityContainerType.CONTAINER.value
-                    ):
-                        traverse_path(container.get("id"), container.get("path"))
-
-            except Exception as exc:
-                logging.info(
-                    "Location {} contains no tables or views. Skipping...".format(id)
-                )
-                self.report.warning(
-                    message="Failed to get tables or views",
-                    context=f"{id}",
-                    exc=exc,
-                )
-
-            return containers
-
-        return traverse_path(location_id=resource_id, entity_path=path)
+    def _check_pattern_match(
+        self,
+        pattern: str,
+        paths: List[str],
+        allow_prefix: bool = True,
+    ) -> bool:
+        """
+        Helper method to check if a pattern matches any of the paths.
+        Handles hierarchical matching where each level is matched independently.
+        Also handles prefix matching for partial paths.
+        """
+        if pattern == ".*":
+            return True
+
+        # Convert the pattern to regex with proper anchoring
+        regex_pattern = pattern
+        if pattern.startswith("^"):
+            # Already has start anchor
+            regex_pattern = pattern.replace(".", r"\.")  # Escape dots
+            regex_pattern = regex_pattern.replace(
+                r"\.*", ".*"
+            )  # Convert .* to wildcard
+        else:
+            # Add start anchor and handle dots
+            regex_pattern = "^" + pattern.replace(".", r"\.").replace(r"\.*", ".*")
+
+        # Handle end matching
+        if not pattern.endswith(".*"):
+            if pattern.endswith("$"):
+                # Keep explicit end anchor
+                pass
+            elif not allow_prefix:
+                # Add end anchor for exact matching
+                regex_pattern = regex_pattern + "$"
+
+        for path in paths:
+            if re.match(regex_pattern, path, re.IGNORECASE):
+                return True
+
+        return False
+
+    def should_include_container(self, path: List[str], name: str) -> bool:
+        """
+        Helper method to check if a container should be included based on schema patterns.
+        Used by both get_all_containers and get_containers_for_location.
+        """
+        path_components = path + [name] if path else [name]
+        full_path = ".".join(path_components)
+
+        # Default allow everything case
+        if self.allow_schema_pattern == [".*"] and not self.deny_schema_pattern:
+            self.report.report_container_scanned(full_path)
+            return True
+
+        # Check deny patterns first
+        if self.deny_schema_pattern:
+            for pattern in self.deny_schema_pattern:
+                if self._check_pattern_match(
+                    pattern=pattern,
+                    paths=[full_path],
+                    allow_prefix=False,
+                ):
+                    self.report.report_container_filtered(full_path)
+                    return False
+
+        # Check allow patterns
+        for pattern in self.allow_schema_pattern:
+            # For patterns with wildcards, check if this path is a parent of the pattern
+            if "*" in pattern:
+                pattern_parts = pattern.split(".")
+                path_parts = path_components
+
+                # If pattern has exact same number of parts, check each component
+                if len(pattern_parts) == len(path_parts):
+                    matches = True
+                    for p_part, c_part in zip(pattern_parts, path_parts):
+                        if p_part != "*" and p_part.lower() != c_part.lower():
+                            matches = False
+                            break
+                    if matches:
+                        self.report.report_container_scanned(full_path)
+                        return True
+                # Otherwise check if current path is prefix match
+                else:
+                    # Remove the trailing wildcard if present
+                    if pattern_parts[-1] == "*":
+                        pattern_parts = pattern_parts[:-1]
+
+                    for i in range(len(path_parts)):
+                        current_path = ".".join(path_parts[: i + 1])
+                        pattern_prefix = ".".join(pattern_parts[: i + 1])
+
+                        if pattern_prefix.startswith(current_path):
+                            self.report.report_container_scanned(full_path)
+                            return True
+
+            # Direct pattern matching
+            if self._check_pattern_match(
+                pattern=pattern,
+                paths=[full_path],
+                allow_prefix=True,
+            ):
+                self.report.report_container_scanned(full_path)
+                return True
+
+        self.report.report_container_filtered(full_path)
+        return False

     def get_all_containers(self):
         """
-        Query the Dremio sources API and return source information.
+        Query the Dremio sources API and return filtered source information.
         """
         containers = []

         response = self.get(url="/catalog")

         def process_source(source):
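For reference, the pattern handling above can be exercised outside the connector. The following is a minimal, standalone sketch (the names schema_pattern_to_regex and matches are illustrative, not part of this change) that mirrors the same anchoring and wildcard rules:

import re
from typing import List


def schema_pattern_to_regex(pattern: str, allow_prefix: bool = True) -> str:
    # Same anchoring rules as _check_pattern_match: escape dots, turn ".*" back
    # into a wildcard, anchor at the start, and only anchor the end when prefix
    # matching is not allowed.
    if pattern.startswith("^"):
        regex = pattern.replace(".", r"\.").replace(r"\.*", ".*")
    else:
        regex = "^" + pattern.replace(".", r"\.").replace(r"\.*", ".*")
    if not pattern.endswith(".*") and not pattern.endswith("$") and not allow_prefix:
        regex += "$"
    return regex


def matches(pattern: str, paths: List[str], allow_prefix: bool = True) -> bool:
    if pattern == ".*":
        return True
    regex = schema_pattern_to_regex(pattern, allow_prefix)
    return any(re.match(regex, p, re.IGNORECASE) for p in paths)


# "prod.data.*" accepts the dotted path "prod.data.sales" but not "dev.data.sales".
assert matches("prod.data.*", ["prod.data.sales"])
assert not matches("prod.data.*", ["dev.data.sales"])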
@@ -731,11 +770,11 @@ class DremioAPIOperations:
             )

             source_config = source_resp.get("config", {})
-            if source_config.get("database"):
-                db = source_config.get("database")
-            else:
-                db = source_config.get("databaseName", "")
+            db = source_config.get(
+                "database", source_config.get("databaseName", "")
+            )

+            if self.should_include_container([], source.get("path")[0]):
                 return {
                     "id": source.get("id"),
                     "name": source.get("path")[0],
@@ -746,19 +785,26 @@ class DremioAPIOperations:
                     "database_name": db,
                 }
             else:
+                if self.should_include_container([], source.get("path")[0]):
                     return {
                         "id": source.get("id"),
                         "name": source.get("path")[0],
                         "path": [],
                         "container_type": DremioEntityContainerType.SPACE,
                     }
             return None

         def process_source_and_containers(source):
             container = process_source(source)
             if not container:
                 return []

             # Get sub-containers
             sub_containers = self.get_containers_for_location(
                 resource_id=container.get("id"),
                 path=[container.get("name")],
             )

             return [container] + sub_containers

         # Use ThreadPoolExecutor to parallelize the processing of sources
@@ -771,7 +817,16 @@ class DremioAPIOperations:
         }

         for future in concurrent.futures.as_completed(future_to_source):
+            source = future_to_source[future]
             try:
                 containers.extend(future.result())
             except Exception as exc:
                 logger.error(f"Error processing source: {exc}")
+                self.report.warning(
+                    message="Failed to process source",
+                    context=f"{source}",
+                    exc=exc,
+                )

         return containers
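As context for the executor loop above, this is the standard submit/as_completed pattern it relies on; a minimal sketch with placeholder names (fetch_containers, sources) rather than the connector's own methods:

import concurrent.futures
import logging

logger = logging.getLogger(__name__)


def fetch_containers(source: dict) -> list:
    # Placeholder for process_source_and_containers(source).
    return [{"id": source["id"]}]


sources = [{"id": "s1"}, {"id": "s2"}]
containers: list = []

with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    future_to_source = {executor.submit(fetch_containers, s): s for s in sources}
    for future in concurrent.futures.as_completed(future_to_source):
        source = future_to_source[future]
        try:
            # A failing source is logged and skipped; the others still land in the result.
            containers.extend(future.result())
        except Exception as exc:
            logger.error(f"Error processing source {source}: {exc}")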
@@ -785,3 +840,55 @@ class DremioAPIOperations:
             )
         else:
             return ""
+
+    def get_containers_for_location(
+        self, resource_id: str, path: List[str]
+    ) -> List[Dict[str, str]]:
+        containers = []
+
+        def traverse_path(location_id: str, entity_path: List[str]) -> List:
+            nonlocal containers
+            try:
+                response = self.get(url=f"/catalog/{location_id}")
+
+                # Check if current folder should be included
+                if (
+                    response.get("entityType")
+                    == DremioEntityContainerType.FOLDER.value.lower()
+                ):
+                    folder_name = entity_path[-1]
+                    folder_path = entity_path[:-1]
+
+                    if self.should_include_container(folder_path, folder_name):
+                        containers.append(
+                            {
+                                "id": location_id,
+                                "name": folder_name,
+                                "path": folder_path,
+                                "container_type": DremioEntityContainerType.FOLDER,
+                            }
+                        )
+
+                # Recursively process child containers
+                for container in response.get("children", []):
+                    if (
+                        container.get("type")
+                        == DremioEntityContainerType.CONTAINER.value
+                    ):
+                        traverse_path(container.get("id"), container.get("path"))
+
+            except Exception as exc:
+                logging.info(
+                    "Location {} contains no tables or views. Skipping...".format(
+                        location_id
+                    )
+                )
+                self.report.warning(
+                    message="Failed to get tables or views",
+                    context=f"{location_id}",
+                    exc=exc,
+                )
+
+            return containers
+
+        return traverse_path(location_id=resource_id, entity_path=path)
@@ -31,6 +31,7 @@ class DremioToDataHubSourceTypeMapping:
         "SNOWFLAKE": "snowflake",
         "SYNAPSE": "mssql",
         "TERADATA": "teradata",
+        "VERTICA": "vertica",
     }

     DATABASE_SOURCE_TYPES = {
@@ -52,6 +53,7 @@ class DremioToDataHubSourceTypeMapping:
         "SNOWFLAKE",
         "SYNAPSE",
         "TERADATA",
+        "VERTICA",
     }

     FILE_OBJECT_STORAGE_TYPES = {
@@ -14,12 +14,27 @@ class DremioSourceReport(
 ):
     num_containers_failed: int = 0
     num_datasets_failed: int = 0
+    containers_scanned: int = 0
+    containers_filtered: int = 0

     def report_upstream_latency(self, start_time: datetime, end_time: datetime) -> None:
         # recording total combined latency is not very useful, keeping this method as a placeholder
         # for future implementation of min / max / percentiles etc.
         pass

+    def report_container_scanned(self, name: str) -> None:
+        """
+        Record that a container was successfully scanned
+        """
+        self.containers_scanned += 1
+
+    def report_container_filtered(self, container_name: str) -> None:
+        """
+        Record that a container was filtered out
+        """
+        self.containers_filtered += 1
+        self.report_dropped(container_name)
+
     def report_entity_scanned(self, name: str, ent_type: str = "View") -> None:
         """
         Entity could be a view or a table
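A small sketch of how the two new counters are expected to behave (an illustrative stand-in class, not the DataHub report object): every container considered by the connector is counted exactly once, either as scanned or as filtered, and filtered names are also recorded as dropped.

from dataclasses import dataclass, field
from typing import List


@dataclass
class ContainerFilterReport:
    containers_scanned: int = 0
    containers_filtered: int = 0
    dropped: List[str] = field(default_factory=list)

    def report_container_scanned(self, name: str) -> None:
        self.containers_scanned += 1

    def report_container_filtered(self, container_name: str) -> None:
        self.containers_filtered += 1
        self.dropped.append(container_name)  # stands in for report_dropped()


report = ContainerFilterReport()
report.report_container_scanned("Samples")
report.report_container_filtered("internal.secrets")
assert (report.containers_scanned, report.containers_filtered) == (1, 1)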
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -0,0 +1,26 @@
source:
  type: dremio
  config:
    # Coordinates
    hostname: localhost
    port: 9047
    tls: false

    # Credentials
    authentication_method: password
    username: admin
    password: "2310Admin1234!@"

    platform_instance: test-platform

    include_query_lineage: false

    source_mappings:
      - platform: s3
        source_name: samples
        platform_instance: s3_test_samples

sink:
  type: file
  config:
    filename: "./dremio_mces.json"
File diff suppressed because it is too large
@@ -0,0 +1,28 @@
source:
  type: dremio
  config:
    # Coordinates
    hostname: localhost
    port: 9047
    tls: false

    # Credentials
    authentication_method: password
    username: admin
    password: "2310Admin1234!@"

    include_query_lineage: false

    source_mappings:
      - platform: s3
        source_name: samples
        platform_instance: s3_test_samples

    schema_pattern:
      allow:
        - "Samples"

sink:
  type: file
  config:
    filename: "./dremio_mces.json"
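The same filter can also be expressed programmatically; a sketch using the DremioSourceConfig constructor exactly as the unit-test fixture later in this diff does, with placeholder credentials:

from datahub.ingestion.source.dremio.dremio_config import DremioSourceConfig

config = DremioSourceConfig(
    hostname="localhost",
    port=9047,
    tls=False,
    authentication_method="password",
    username="admin",
    password="dummy-password",  # placeholder
    schema_pattern=dict(allow=["Samples"], deny=[]),
)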
@@ -1,6 +1,7 @@
 import json
 import os
 import subprocess
+from typing import Dict

 import boto3
 import pytest
@@ -75,9 +76,10 @@ def create_spaces_and_folders(headers):


 def create_sample_source(headers):
-    url = f"{DREMIO_HOST}/apiv2/source/Samples"
+    url = f"{DREMIO_HOST}/api/v3/catalog"

     payload = {
+        "entityType": "source",
         "config": {
             "externalBucketList": ["samples.dremio.com"],
             "credentialType": "NONE",
@@ -95,14 +97,15 @@ def create_sample_source(headers):
         "type": "S3",
     }

-    response = requests.put(url, headers=headers, data=json.dumps(payload))
+    response = requests.post(url, headers=headers, data=json.dumps(payload))
     assert response.status_code == 200, f"Failed to add dataset: {response.text}"


 def create_s3_source(headers):
-    url = f"{DREMIO_HOST}/apiv2/source/s3"
+    url = f"{DREMIO_HOST}/api/v3/catalog"

     payload = {
+        "entityType": "source",
         "name": "s3",
         "config": {
             "credentialType": "ACCESS_KEY",
@@ -139,24 +142,25 @@ def create_s3_source(headers):
         "metadataPolicy": {
             "deleteUnavailableDatasets": True,
             "autoPromoteDatasets": False,
-            "namesRefreshMillis": 3600000,
-            "datasetDefinitionRefreshAfterMillis": 3600000,
-            "datasetDefinitionExpireAfterMillis": 10800000,
-            "authTTLMillis": 86400000,
-            "updateMode": "PREFETCH_QUERIED",
+            "namesRefreshMs": 3600000,
+            "datasetRefreshAfterMs": 3600000,
+            "datasetExpireAfterMs": 10800000,
+            "authTTLMs": 86400000,
+            "datasetUpdateMode": "PREFETCH_QUERIED",
         },
         "type": "S3",
+        "accessControlList": {"userControls": [], "roleControls": []},
     }

-    response = requests.put(url, headers=headers, data=json.dumps(payload))
+    response = requests.post(url, headers=headers, data=json.dumps(payload))
     assert response.status_code == 200, f"Failed to add s3 datasource: {response.text}"


 def create_mysql_source(headers):
-    url = f"{DREMIO_HOST}/apiv2/source/mysql"
+    url = f"{DREMIO_HOST}/api/v3/catalog"

     payload = {
         "entityType": "source",
         "config": {
             "username": "root",
             "password": "rootpwd123",
|
||||
"maxIdleConns": 8,
|
||||
"idleTimeSec": 60,
|
||||
},
|
||||
"name": "mysql-source",
|
||||
"name": "mysql",
|
||||
"accelerationRefreshPeriod": 3600000,
|
||||
"accelerationGracePeriod": 10800000,
|
||||
"accelerationActivePolicyType": "PERIOD",
|
||||
@@ -177,30 +181,46 @@ def create_mysql_source(headers):
         "accelerationRefreshOnDataChanges": False,
         "metadataPolicy": {
             "deleteUnavailableDatasets": True,
-            "namesRefreshMillis": 3600000,
-            "datasetDefinitionRefreshAfterMillis": 3600000,
-            "datasetDefinitionExpireAfterMillis": 10800000,
-            "authTTLMillis": 86400000,
-            "updateMode": "PREFETCH_QUERIED",
+            "namesRefreshMs": 3600000,
+            "datasetRefreshAfterMs": 3600000,
+            "datasetExpireAfterMs": 10800000,
+            "authTTLMs": 86400000,
+            "datasetUpdateMode": "PREFETCH_QUERIED",
         },
         "type": "MYSQL",
     }
-    response = requests.put(url, headers=headers, data=json.dumps(payload))
+    response = requests.post(url, headers=headers, data=json.dumps(payload))
     assert (
         response.status_code == 200
     ), f"Failed to add mysql datasource: {response.text}"


 def upload_dataset(headers):
-    url = f"{DREMIO_HOST}/apiv2/source/s3/file_format/warehouse/sample.parquet"
-    payload = {"ignoreOtherFileFormats": False, "type": "Parquet"}
+    url = f"{DREMIO_HOST}/api/v3/catalog/dremio%3A%2Fs3%2Fwarehouse"
+    payload = {
+        "entityType": "dataset",
+        "type": "PHYSICAL_DATASET",
+        "path": [
+            "s3",
+            "warehouse",
+        ],
+        "format": {"type": "Parquet"},
+    }

-    response = requests.put(url, headers=headers, data=json.dumps(payload))
+    response = requests.post(url, headers=headers, data=json.dumps(payload))
     assert response.status_code == 200, f"Failed to add dataset: {response.text}"

-    url = f"{DREMIO_HOST}/apiv2/source/Samples/file_format/samples.dremio.com/NYC-weather.csv"
+    url = f"{DREMIO_HOST}/api/v3/catalog/dremio%3A%2FSamples%2Fsamples.dremio.com%2FNYC-weather.csv"

     payload = {
+        "entityType": "dataset",
+        "type": "PHYSICAL_DATASET",
+        "path": [
+            "Samples",
+            "samples.dremio.com",
+            "NYC-weather.csv",
+        ],
         "format": {
             "fieldDelimiter": ",",
             "quote": '"',
             "comment": "#",
@@ -210,21 +230,41 @@ def upload_dataset(headers):
             "trimHeader": True,
             "skipFirstLine": False,
             "type": "Text",
         },
     }

-    response = requests.put(url, headers=headers, data=json.dumps(payload))
+    response = requests.post(url, headers=headers, data=json.dumps(payload))
     assert response.status_code == 200, f"Failed to add dataset: {response.text}"

-    url = f"{DREMIO_HOST}/apiv2/source/Samples/file_format/samples.dremio.com/Dremio%20University/oracle-departments.xlsx"
-
-    payload = {"extractHeader": True, "hasMergedCells": False, "type": "Excel"}
-
-    response = requests.put(url, headers=headers, data=json.dumps(payload))
-    assert response.status_code == 200, f"Failed to add dataset: {response.text}"
-
-    url = f"{DREMIO_HOST}/apiv2/source/Samples/file_format/samples.dremio.com/Dremio%20University/googleplaystore.csv"
+    url = f"{DREMIO_HOST}/api/v3/catalog/dremio%3A%2FSamples%2Fsamples.dremio.com%2FDremio%20University%2Foracle-departments.xlsx"
+
+    payload = {
+        "entityType": "dataset",
+        "type": "PHYSICAL_DATASET",
+        "path": [
+            "Samples",
+            "samples.dremio.com",
+            "Dremio University",
+            "oracle-departments.xlsx",
+        ],
+        "format": {"extractHeader": True, "hasMergedCells": False, "type": "Excel"},
+    }
+
+    response = requests.post(url, headers=headers, data=json.dumps(payload))
+    assert response.status_code == 200, f"Failed to add dataset: {response.text}"
+
+    url = f"{DREMIO_HOST}/api/v3/catalog/dremio%3A%2FSamples%2Fsamples.dremio.com%2FDremio%20University%2Fgoogleplaystore.csv"

     payload = {
+        "entityType": "dataset",
+        "type": "PHYSICAL_DATASET",
+        "path": [
+            "Samples",
+            "samples.dremio.com",
+            "Dremio University",
+            "googleplaystore.csv",
+        ],
         "format": {
             "fieldDelimiter": ",",
             "quote": '"',
             "comment": "#",
@@ -234,15 +274,28 @@ def upload_dataset(headers):
             "trimHeader": True,
             "skipFirstLine": False,
             "type": "Text",
         },
     }

-    response = requests.put(url, headers=headers, data=json.dumps(payload))
+    response = requests.post(url, headers=headers, data=json.dumps(payload))
     assert response.status_code == 200, f"Failed to add dataset: {response.text}"

-    url = f"{DREMIO_HOST}/apiv2/source/Samples/file_format/samples.dremio.com/tpcds_sf1000/catalog_page/1ab266d5-18eb-4780-711d-0fa337fa6c00/0_0_0.parquet"
-    payload = {"ignoreOtherFileFormats": False, "type": "Parquet"}
+    url = f"{DREMIO_HOST}/api/v3/catalog/dremio%3A%2FSamples%2Fsamples.dremio.com%2Ftpcds_sf1000%2Fcatalog_page%2F1ab266d5-18eb-4780-711d-0fa337fa6c00%2F0_0_0.parquet"
+    payload = {
+        "entityType": "dataset",
+        "type": "PHYSICAL_DATASET",
+        "path": [
+            "Samples",
+            "samples.dremio.com",
+            "tpcds_sf1000",
+            "catalog_page",
+            "1ab266d5-18eb-4780-711d-0fa337fa6c00",
+            "0_0_0.parquet",
+        ],
+        "format": {"type": "Parquet"},
+    }

-    response = requests.put(url, headers=headers, data=json.dumps(payload))
+    response = requests.post(url, headers=headers, data=json.dumps(payload))
     assert response.status_code == 200, f"Failed to add dataset: {response.text}"
@@ -253,7 +306,7 @@ def create_view(headers):
         "entityType": "dataset",
         "type": "VIRTUAL_DATASET",
         "path": ["space", "test_folder", "raw"],
-        "sql": 'SELECT * FROM s3.warehouse."sample.parquet"',
+        "sql": "SELECT * FROM s3.warehouse",
     }
     response = requests.post(url, headers=headers, data=json.dumps(payload))
     assert response.status_code == 200, f"Failed to create view: {response.text}"
@@ -273,7 +326,7 @@ def create_view(headers):
         "entityType": "dataset",
         "type": "VIRTUAL_DATASET",
         "path": ["space", "test_folder", "customers"],
-        "sql": 'SELECT * FROM "mysql".northwind.customers',
+        "sql": "SELECT * FROM mysql.northwind.customers",
         "sqlContext": ["mysql", "northwind"],
     }
     response = requests.post(url, headers=headers, data=json.dumps(payload))
@@ -283,7 +336,7 @@ def create_view(headers):
         "entityType": "dataset",
         "type": "VIRTUAL_DATASET",
         "path": ["space", "test_folder", "orders"],
-        "sql": 'SELECT * FROM "mysql".northwind.orders',
+        "sql": "SELECT * FROM mysql.northwind.orders",
         "sqlContext": ["mysql", "northwind"],
     }
     response = requests.post(url, headers=headers, data=json.dumps(payload))
@@ -293,7 +346,7 @@ def create_view(headers):
         "entityType": "dataset",
         "type": "VIRTUAL_DATASET",
         "path": ["space", "test_folder", "metadata_aspect"],
-        "sql": 'SELECT * FROM "mysql".metagalaxy."metadata_aspect"',
+        "sql": "SELECT * FROM mysql.metagalaxy.metadata_aspect",
         "sqlContext": ["mysql", "metagalaxy"],
     }
     response = requests.post(url, headers=headers, data=json.dumps(payload))
@@ -303,7 +356,7 @@ def create_view(headers):
         "entityType": "dataset",
         "type": "VIRTUAL_DATASET",
         "path": ["space", "test_folder", "metadata_index"],
-        "sql": 'SELECT * FROM "mysql".metagalaxy."metadata_index"',
+        "sql": "SELECT * FROM mysql.metagalaxy.metadata_index",
         "sqlContext": ["mysql", "metagalaxy"],
     }
     response = requests.post(url, headers=headers, data=json.dumps(payload))
@@ -313,7 +366,7 @@ def create_view(headers):
         "entityType": "dataset",
         "type": "VIRTUAL_DATASET",
         "path": ["space", "test_folder", "metadata_index_view"],
-        "sql": 'SELECT * FROM "mysql".metagalaxy."metadata_index_view"',
+        "sql": "SELECT * FROM mysql.metagalaxy.metadata_index_view",
         "sqlContext": ["mysql", "metagalaxy"],
     }
     response = requests.post(url, headers=headers, data=json.dumps(payload))
@@ -422,14 +475,119 @@ def test_dremio_ingest(
     pytestconfig,
     tmp_path,
 ):
-    # Run the metadata ingestion pipeline.
+    # Run the metadata ingestion pipeline with specific output file
     config_file = (test_resources_dir / "dremio_to_file.yml").resolve()
+    output_path = tmp_path / "dremio_mces.json"

     run_datahub_cmd(["ingest", "-c", f"{config_file}"], tmp_path=tmp_path)

-    # Verify the output.
+    # Verify the output
     mce_helpers.check_golden_file(
         pytestconfig,
-        output_path=tmp_path / "dremio_mces.json",
+        output_path=output_path,
         golden_path=test_resources_dir / "dremio_mces_golden.json",
         ignore_paths=[],
     )


+@freeze_time(FROZEN_TIME)
+@pytest.mark.integration
+def test_dremio_platform_instance_urns(
+    test_resources_dir,
+    dremio_setup,
+    pytestconfig,
+    tmp_path,
+):
+    config_file = (
+        test_resources_dir / "dremio_platform_instance_to_file.yml"
+    ).resolve()
+    output_path = tmp_path / "dremio_mces.json"
+
+    run_datahub_cmd(["ingest", "-c", f"{config_file}"], tmp_path=tmp_path)
+
+    with output_path.open() as f:
+        content = f.read()
+        # Skip if file is empty or just contains brackets
+        if not content or content.strip() in ("[]", "[", "]"):
+            pytest.fail(f"Output file is empty or invalid: {content}")
+
+        try:
+            # Try to load as JSON Lines first
+            mces = []
+            for line in content.splitlines():
+                line = line.strip()
+                if line and line not in ("[", "]"):  # Skip empty lines and bare brackets
+                    mce = json.loads(line)
+                    mces.append(mce)
+        except json.JSONDecodeError:
+            # If that fails, try loading as a single JSON array
+            try:
+                mces = json.loads(content)
+            except json.JSONDecodeError as e:
+                print(f"Failed to parse file content: {content}")
+                raise e
+
+        # Verify MCEs
+        assert len(mces) > 0, "No MCEs found in output file"
+
+        # Verify the platform instances
+        for mce in mces:
+            if "entityType" not in mce:
+                continue
+
+            # Check dataset URN structure
+            if mce["entityType"] == "dataset" and "entityUrn" in mce:
+                assert (
+                    "test-platform.dremio" in mce["entityUrn"]
+                ), f"Platform instance missing in dataset URN: {mce['entityUrn']}"
+
+            # Check aspects for both datasets and containers
+            if "aspectName" in mce:
+                # Check dataPlatformInstance aspect
+                if mce["aspectName"] == "dataPlatformInstance":
+                    aspect = mce["aspect"]
+                    if not isinstance(aspect, Dict) or "json" not in aspect:
+                        continue
+
+                    aspect_json = aspect["json"]
+                    if not isinstance(aspect_json, Dict):
+                        continue
+
+                    if "instance" not in aspect_json:
+                        continue
+
+                    instance = aspect_json["instance"]
+                    expected_instance = "urn:li:dataPlatformInstance:(urn:li:dataPlatform:dremio,test-platform)"
+                    assert (
+                        instance == expected_instance
+                    ), f"Invalid platform instance format: {instance}"
+
+    # Verify against golden file
+    mce_helpers.check_golden_file(
+        pytestconfig,
+        output_path=output_path,
+        golden_path=test_resources_dir / "dremio_platform_instance_mces_golden.json",
+        ignore_paths=[],
+    )
+
+
+@freeze_time(FROZEN_TIME)
+@pytest.mark.integration
+def test_dremio_schema_filter(
+    test_resources_dir,
+    dremio_setup,
+    pytestconfig,
+    tmp_path,
+):
+    config_file = (test_resources_dir / "dremio_schema_filter_to_file.yml").resolve()
+    output_path = tmp_path / "dremio_mces.json"
+
+    run_datahub_cmd(["ingest", "-c", f"{config_file}"], tmp_path=tmp_path)
+
+    # Verify against golden file
+    mce_helpers.check_golden_file(
+        pytestconfig,
+        output_path=output_path,
+        golden_path=test_resources_dir / "dremio_schema_filter_mces_golden.json",
+        ignore_paths=[],
+    )
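The JSON Lines fallback used in test_dremio_platform_instance_urns above can be isolated into a small helper; a sketch (the load_mces name and the path are illustrative): the file sink may write one MCE per line or a single JSON array, so line-by-line parsing is tried first and whole-document parsing is the fallback.

import json
from pathlib import Path


def load_mces(path: Path) -> list:
    content = path.read_text().strip()
    mces = []
    try:
        # JSON Lines: one MCE object per line, ignoring bare brackets.
        for line in content.splitlines():
            line = line.strip()
            if line and line not in ("[", "]"):
                mces.append(json.loads(line))
    except json.JSONDecodeError:
        # Fallback: the whole file is a single JSON array.
        mces = json.loads(content)
    return mces


# Example (illustrative path): mces = load_mces(Path("./dremio_mces.json"))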
@@ -0,0 +1,123 @@
from unittest.mock import Mock

import pytest

from datahub.ingestion.source.dremio.dremio_api import DremioAPIOperations
from datahub.ingestion.source.dremio.dremio_config import DremioSourceConfig
from datahub.ingestion.source.dremio.dremio_reporting import DremioSourceReport


class TestDremioContainerFiltering:
    @pytest.fixture
    def dremio_api(self, monkeypatch):
        # Mock the requests.Session
        mock_session = Mock()
        monkeypatch.setattr("requests.Session", Mock(return_value=mock_session))

        # Mock the authentication response
        mock_session.post.return_value.json.return_value = {"token": "dummy-token"}
        mock_session.post.return_value.status_code = 200

        config = DremioSourceConfig(
            hostname="dummy-host",
            port=9047,
            tls=False,
            authentication_method="password",
            username="dummy-user",
            password="dummy-password",
            schema_pattern=dict(allow=[".*"], deny=[]),
        )
        report = DremioSourceReport()
        return DremioAPIOperations(config, report)

    def test_basic_allow_pattern(self, dremio_api):
        """Test basic allow pattern matching"""
        dremio_api.allow_schema_pattern = ["test"]
        dremio_api.deny_schema_pattern = []

        assert dremio_api.should_include_container([], "test")
        assert dremio_api.should_include_container(["test"], "subfolder")
        assert not dremio_api.should_include_container([], "prod_space")

    def test_basic_deny_pattern(self, dremio_api):
        """Test basic deny pattern matching"""
        dremio_api.allow_schema_pattern = [".*"]
        dremio_api.deny_schema_pattern = ["test_space.*"]

        assert not dremio_api.should_include_container([], "test_space")
        assert not dremio_api.should_include_container(["test_space"], "subfolder")
        assert dremio_api.should_include_container([], "prod_space")

    def test_hierarchical_matching(self, dremio_api):
        """Test matching with hierarchical paths"""
        dremio_api.allow_schema_pattern = ["prod.data.*"]
        dremio_api.deny_schema_pattern = []

        assert dremio_api.should_include_container([], "prod")
        assert dremio_api.should_include_container(["prod"], "data")
        assert dremio_api.should_include_container(["prod", "data"], "sales")
        assert not dremio_api.should_include_container([], "dev")
        assert not dremio_api.should_include_container(["dev"], "data")

    def test_allow_and_deny_patterns(self, dremio_api):
        """Test combination of allow and deny patterns"""
        dremio_api.allow_schema_pattern = ["prod.*"]
        dremio_api.deny_schema_pattern = ["prod.internal.*"]

        assert dremio_api.should_include_container([], "prod")
        assert dremio_api.should_include_container(["prod"], "public")
        assert dremio_api.should_include_container(["prod", "public"], "next")
        assert not dremio_api.should_include_container(["prod"], "internal")
        assert not dremio_api.should_include_container(["prod", "internal"], "secrets")

    def test_wildcard_patterns(self, dremio_api):
        """Test wildcard pattern handling"""
        dremio_api.allow_schema_pattern = [".*"]
        dremio_api.deny_schema_pattern = []

        assert dremio_api.should_include_container([], "any_space")
        assert dremio_api.should_include_container(["any_space"], "any_folder")

        # Test with specific wildcard in middle
        dremio_api.allow_schema_pattern = ["prod.*.public"]
        assert dremio_api.should_include_container(["prod", "customer"], "public")
        assert not dremio_api.should_include_container(["prod", "customer"], "private")

    def test_case_insensitive_matching(self, dremio_api):
        """Test case-insensitive pattern matching"""
        dremio_api.allow_schema_pattern = ["PROD.*"]
        dremio_api.deny_schema_pattern = []

        assert dremio_api.should_include_container([], "prod")
        assert dremio_api.should_include_container([], "PROD")
        assert dremio_api.should_include_container(["prod"], "DATA")
        assert dremio_api.should_include_container(["PROD"], "data")

    def test_empty_patterns(self, dremio_api):
        """Test behavior with empty patterns"""
        dremio_api.allow_schema_pattern = [".*"]
        dremio_api.deny_schema_pattern = []

        # Should allow everything when allow pattern is empty
        assert dremio_api.should_include_container([], "any_space")
        assert dremio_api.should_include_container(["any_space"], "any_folder")

    def test_partial_path_matching(self, dremio_api):
        """Test matching behavior with partial paths"""
        dremio_api.allow_schema_pattern = ["^pr.*.data.*"]
        dremio_api.deny_schema_pattern = []

        assert dremio_api.should_include_container(["prod"], "data")
        # Should match the partial path even though pattern doesn't have wildcards
        assert dremio_api.should_include_container(["prod", "data"], "sales")
        assert not dremio_api.should_include_container([], "dev")
        assert not dremio_api.should_include_container(["dev", "data"], "sales")

    def test_partial_start_end_chars(self, dremio_api):
        """Test matching behavior with partial paths"""
        dremio_api.allow_schema_pattern = ["pr.*.data$"]
        dremio_api.deny_schema_pattern = []

        assert dremio_api.should_include_container(["prod"], "data")
        # Should match the partial path even though pattern doesn't have wildcards
        assert not dremio_api.should_include_container(["prod", "data"], "sales")