fix(ingestion/dremio): Ignore filtered containers in schema allow/deny pattern (#11959)

Co-authored-by: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com>
Jonny Dixon 2024-12-13 09:25:31 +00:00 committed by GitHub
parent 7c1d3b09ed
commit 06edf23a33
10 changed files with 12962 additions and 1536 deletions
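
For context, a minimal sketch of the behaviour this fix enforces, mirroring the unit tests added at the end of this commit (dremio_api is a DremioAPIOperations instance as constructed there; container names are illustrative):

dremio_api.allow_schema_pattern = ["prod.*"]
dremio_api.deny_schema_pattern = ["prod.internal.*"]
# Containers outside the allow pattern, or matching a deny pattern, are no longer emitted
assert dremio_api.should_include_container([], "prod")
assert dremio_api.should_include_container(["prod"], "public")
assert not dremio_api.should_include_container(["prod"], "internal")
assert not dremio_api.should_include_container([], "dev")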

View File

@@ -1,6 +1,7 @@
import concurrent.futures
import json
import logging
import re
import warnings
from collections import defaultdict
from enum import Enum
@@ -609,32 +610,6 @@ class DremioAPIOperations:
return self.execute_query(query=jobs_query)
def get_source_by_id(self, source_id: str) -> Optional[Dict]:
"""
Fetch source details by ID.
"""
response = self.get(
url=f"/source/{source_id}",
)
return response if response else None
def get_source_for_dataset(self, schema: str, dataset: str) -> Optional[Dict]:
"""
Get source information for a dataset given its schema and name.
"""
dataset_id = self.get_dataset_id(schema, dataset)
if not dataset_id:
return None
catalog_entry = self.get(
url=f"/catalog/{dataset_id}",
)
if not catalog_entry or "path" not in catalog_entry:
return None
source_id = catalog_entry["path"][0]
return self.get_source_by_id(source_id)
def get_tags_for_resource(self, resource_id: str) -> Optional[List[str]]:
"""
Get Dremio tags for a given resource_id.
@@ -673,55 +648,119 @@ class DremioAPIOperations:
)
return None
def get_containers_for_location(
self, resource_id: str, path: List[str]
) -> List[Dict[str, str]]:
containers = []
def _check_pattern_match(
self,
pattern: str,
paths: List[str],
allow_prefix: bool = True,
) -> bool:
"""
Helper method to check if a pattern matches any of the paths.
Handles hierarchical matching where each level is matched independently.
Also handles prefix matching for partial paths.
"""
if pattern == ".*":
return True
def traverse_path(location_id: str, entity_path: List[str]) -> List:
nonlocal containers
try:
response = self.get(url=f"/catalog/{location_id}")
if (
response.get("entityType")
== DremioEntityContainerType.FOLDER.value.lower()
# Convert the pattern to regex with proper anchoring
regex_pattern = pattern
if pattern.startswith("^"):
# Already has start anchor
regex_pattern = pattern.replace(".", r"\.") # Escape dots
regex_pattern = regex_pattern.replace(
r"\.*", ".*"
) # Convert .* to wildcard
else:
# Add start anchor and handle dots
regex_pattern = "^" + pattern.replace(".", r"\.").replace(r"\.*", ".*")
# Handle end matching
if not pattern.endswith(".*"):
if pattern.endswith("$"):
# Keep explicit end anchor
pass
elif not allow_prefix:
# Add end anchor for exact matching
regex_pattern = regex_pattern + "$"
for path in paths:
if re.match(regex_pattern, path, re.IGNORECASE):
return True
return False
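# Illustrative conversions performed above (assumed behaviour, for clarity only):
#   _check_pattern_match("prod.*", ["prod.internal.secrets"], allow_prefix=False) -> True
#       (a trailing ".*" keeps the regex open-ended, so no end anchor is added)
#   _check_pattern_match("test", ["test"], allow_prefix=False)           -> True
#   _check_pattern_match("test", ["test.subfolder"], allow_prefix=False) -> False
#       (allow_prefix=False appends "$", forcing an exact match)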
def should_include_container(self, path: List[str], name: str) -> bool:
"""
Helper method to check if a container should be included based on schema patterns.
Used by both get_all_containers and get_containers_for_location.
"""
path_components = path + [name] if path else [name]
full_path = ".".join(path_components)
# Default allow everything case
if self.allow_schema_pattern == [".*"] and not self.deny_schema_pattern:
self.report.report_container_scanned(full_path)
return True
# Check deny patterns first
if self.deny_schema_pattern:
for pattern in self.deny_schema_pattern:
if self._check_pattern_match(
pattern=pattern,
paths=[full_path],
allow_prefix=False,
):
containers.append(
{
"id": location_id,
"name": entity_path[-1],
"path": entity_path[:-1],
"container_type": DremioEntityContainerType.FOLDER,
}
)
self.report.report_container_filtered(full_path)
return False
for container in response.get("children", []):
if (
container.get("type")
== DremioEntityContainerType.CONTAINER.value
):
traverse_path(container.get("id"), container.get("path"))
# Check allow patterns
for pattern in self.allow_schema_pattern:
# For patterns with wildcards, check if this path is a parent of the pattern
if "*" in pattern:
pattern_parts = pattern.split(".")
path_parts = path_components
except Exception as exc:
logging.info(
"Location {} contains no tables or views. Skipping...".format(id)
)
self.report.warning(
message="Failed to get tables or views",
context=f"{id}",
exc=exc,
)
# If pattern has exact same number of parts, check each component
if len(pattern_parts) == len(path_parts):
matches = True
for p_part, c_part in zip(pattern_parts, path_parts):
if p_part != "*" and p_part.lower() != c_part.lower():
matches = False
break
if matches:
self.report.report_container_scanned(full_path)
return True
# Otherwise check if current path is prefix match
else:
# Remove the trailing wildcard if present
if pattern_parts[-1] == "*":
pattern_parts = pattern_parts[:-1]
return containers
for i in range(len(path_parts)):
current_path = ".".join(path_parts[: i + 1])
pattern_prefix = ".".join(pattern_parts[: i + 1])
return traverse_path(location_id=resource_id, entity_path=path)
if pattern_prefix.startswith(current_path):
self.report.report_container_scanned(full_path)
return True
# Direct pattern matching
if self._check_pattern_match(
pattern=pattern,
paths=[full_path],
allow_prefix=True,
):
self.report.report_container_scanned(full_path)
return True
self.report.report_container_filtered(full_path)
return False
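# Illustrative decision order (mirrors the unit tests added with this change):
# deny patterns are evaluated first against the full dotted path, then allow
# patterns, where a parent path of a wildcard allow pattern counts as a prefix match.
#   allow=["prod.data.*"], deny=[]      -> ([], "prod") is included (prefix of the pattern)
#   allow=[".*"], deny=["test_space.*"] -> (["test_space"], "subfolder") is filtered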
def get_all_containers(self):
"""
Query the Dremio sources API and return source information.
Query the Dremio sources API and return filtered source information.
"""
containers = []
response = self.get(url="/catalog")
def process_source(source):
@@ -731,34 +770,41 @@ class DremioAPIOperations:
)
source_config = source_resp.get("config", {})
if source_config.get("database"):
db = source_config.get("database")
else:
db = source_config.get("databaseName", "")
db = source_config.get(
"database", source_config.get("databaseName", "")
)
return {
"id": source.get("id"),
"name": source.get("path")[0],
"path": [],
"container_type": DremioEntityContainerType.SOURCE,
"source_type": source_resp.get("type"),
"root_path": source_config.get("rootPath"),
"database_name": db,
}
if self.should_include_container([], source.get("path")[0]):
return {
"id": source.get("id"),
"name": source.get("path")[0],
"path": [],
"container_type": DremioEntityContainerType.SOURCE,
"source_type": source_resp.get("type"),
"root_path": source_config.get("rootPath"),
"database_name": db,
}
else:
return {
"id": source.get("id"),
"name": source.get("path")[0],
"path": [],
"container_type": DremioEntityContainerType.SPACE,
}
if self.should_include_container([], source.get("path")[0]):
return {
"id": source.get("id"),
"name": source.get("path")[0],
"path": [],
"container_type": DremioEntityContainerType.SPACE,
}
return None
def process_source_and_containers(source):
container = process_source(source)
if not container:
return []
# Get sub-containers
sub_containers = self.get_containers_for_location(
resource_id=container.get("id"),
path=[container.get("name")],
)
return [container] + sub_containers
# Use ThreadPoolExecutor to parallelize the processing of sources
@@ -771,7 +817,16 @@ class DremioAPIOperations:
}
for future in concurrent.futures.as_completed(future_to_source):
containers.extend(future.result())
source = future_to_source[future]
try:
containers.extend(future.result())
except Exception as exc:
logger.error(f"Error processing source: {exc}")
self.report.warning(
message="Failed to process source",
context=f"{source}",
exc=exc,
)
return containers
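# Shape of an entry returned for an included source (values illustrative):
#   {
#       "id": "<source-id>",
#       "name": "mysql",
#       "path": [],
#       "container_type": DremioEntityContainerType.SOURCE,
#       "source_type": "MYSQL",
#       "root_path": None,
#       "database_name": "northwind",
#   }
# Spaces are returned with only id/name/path/container_type, and sources or spaces
# rejected by should_include_container contribute no entry at all.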
@@ -785,3 +840,55 @@ class DremioAPIOperations:
)
else:
return ""
def get_containers_for_location(
self, resource_id: str, path: List[str]
) -> List[Dict[str, str]]:
containers = []
def traverse_path(location_id: str, entity_path: List[str]) -> List:
nonlocal containers
try:
response = self.get(url=f"/catalog/{location_id}")
# Check if current folder should be included
if (
response.get("entityType")
== DremioEntityContainerType.FOLDER.value.lower()
):
folder_name = entity_path[-1]
folder_path = entity_path[:-1]
if self.should_include_container(folder_path, folder_name):
containers.append(
{
"id": location_id,
"name": folder_name,
"path": folder_path,
"container_type": DremioEntityContainerType.FOLDER,
}
)
# Recursively process child containers
for container in response.get("children", []):
if (
container.get("type")
== DremioEntityContainerType.CONTAINER.value
):
traverse_path(container.get("id"), container.get("path"))
except Exception as exc:
logging.info(
"Location {} contains no tables or views. Skipping...".format(
location_id
)
)
self.report.warning(
message="Failed to get tables or views",
context=f"{location_id}",
exc=exc,
)
return containers
return traverse_path(location_id=resource_id, entity_path=path)
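# Rough behaviour sketch (assumed from the code above): every FOLDER reached while
# walking a source is passed through should_include_container, so folders that fall
# outside the schema allow/deny pattern are simply never appended to the result list.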

View File

@@ -31,6 +31,7 @@ class DremioToDataHubSourceTypeMapping:
"SNOWFLAKE": "snowflake",
"SYNAPSE": "mssql",
"TERADATA": "teradata",
"VERTICA": "vertica",
}
DATABASE_SOURCE_TYPES = {
@@ -52,6 +53,7 @@ class DremioToDataHubSourceTypeMapping:
"SNOWFLAKE",
"SYNAPSE",
"TERADATA",
"VERTICA",
}
FILE_OBJECT_STORAGE_TYPES = {

View File

@@ -14,12 +14,27 @@ class DremioSourceReport(
):
num_containers_failed: int = 0
num_datasets_failed: int = 0
containers_scanned: int = 0
containers_filtered: int = 0
def report_upstream_latency(self, start_time: datetime, end_time: datetime) -> None:
# recording total combined latency is not very useful, keeping this method as a placeholder
# for future implementation of min / max / percentiles etc.
pass
def report_container_scanned(self, name: str) -> None:
"""
Record that a container was successfully scanned
"""
self.containers_scanned += 1
def report_container_filtered(self, container_name: str) -> None:
"""
Record that a container was filtered out
"""
self.containers_filtered += 1
self.report_dropped(container_name)
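# Illustrative usage (assumed): the filtering code in dremio_api.py calls these hooks, e.g.
#   report.report_container_scanned("prod.data")     # containers_scanned  += 1
#   report.report_container_filtered("dev.scratch")  # containers_filtered += 1 and the
#                                                    #   name is recorded via report_dropped()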
def report_entity_scanned(self, name: str, ent_type: str = "View") -> None:
"""
Entity could be a view or a table

View File

@@ -0,0 +1,26 @@
source:
type: dremio
config:
# Coordinates
hostname: localhost
port: 9047
tls: false
# Credentials
authentication_method: password
username: admin
password: "2310Admin1234!@"
platform_instance: test-platform
include_query_lineage: false
source_mappings:
- platform: s3
source_name: samples
platform_instance: s3_test_samples
sink:
type: file
config:
filename: "./dremio_mces.json"

File diff suppressed because it is too large.

View File

@@ -0,0 +1,28 @@
source:
type: dremio
config:
# Coordinates
hostname: localhost
port: 9047
tls: false
# Credentials
authentication_method: password
username: admin
password: "2310Admin1234!@"
include_query_lineage: false
source_mappings:
- platform: s3
source_name: samples
platform_instance: s3_test_samples
schema_pattern:
allow:
- "Samples"
sink:
type: file
config:
filename: "./dremio_mces.json"

View File

@@ -1,6 +1,7 @@
import json
import os
import subprocess
from typing import Dict
import boto3
import pytest
@@ -75,9 +76,10 @@ def create_spaces_and_folders(headers):
def create_sample_source(headers):
url = f"{DREMIO_HOST}/apiv2/source/Samples"
url = f"{DREMIO_HOST}/api/v3/catalog"
payload = {
"entityType": "source",
"config": {
"externalBucketList": ["samples.dremio.com"],
"credentialType": "NONE",
@@ -95,14 +97,15 @@ def create_sample_source(headers):
"type": "S3",
}
response = requests.put(url, headers=headers, data=json.dumps(payload))
response = requests.post(url, headers=headers, data=json.dumps(payload))
assert response.status_code == 200, f"Failed to add dataset: {response.text}"
def create_s3_source(headers):
url = f"{DREMIO_HOST}/apiv2/source/s3"
url = f"{DREMIO_HOST}/api/v3/catalog"
payload = {
"entityType": "source",
"name": "s3",
"config": {
"credentialType": "ACCESS_KEY",
@@ -139,24 +142,25 @@ def create_s3_source(headers):
"metadataPolicy": {
"deleteUnavailableDatasets": True,
"autoPromoteDatasets": False,
"namesRefreshMillis": 3600000,
"datasetDefinitionRefreshAfterMillis": 3600000,
"datasetDefinitionExpireAfterMillis": 10800000,
"authTTLMillis": 86400000,
"updateMode": "PREFETCH_QUERIED",
"namesRefreshMs": 3600000,
"datasetRefreshAfterMs": 3600000,
"datasetExpireAfterMs": 10800000,
"authTTLMs": 86400000,
"datasetUpdateMode": "PREFETCH_QUERIED",
},
"type": "S3",
"accessControlList": {"userControls": [], "roleControls": []},
}
response = requests.put(url, headers=headers, data=json.dumps(payload))
response = requests.post(url, headers=headers, data=json.dumps(payload))
assert response.status_code == 200, f"Failed to add s3 datasource: {response.text}"
def create_mysql_source(headers):
url = f"{DREMIO_HOST}/apiv2/source/mysql"
url = f"{DREMIO_HOST}/api/v3/catalog"
payload = {
"entityType": "source",
"config": {
"username": "root",
"password": "rootpwd123",
@@ -169,7 +173,7 @@ def create_mysql_source(headers):
"maxIdleConns": 8,
"idleTimeSec": 60,
},
"name": "mysql-source",
"name": "mysql",
"accelerationRefreshPeriod": 3600000,
"accelerationGracePeriod": 10800000,
"accelerationActivePolicyType": "PERIOD",
@@ -177,72 +181,121 @@ def create_mysql_source(headers):
"accelerationRefreshOnDataChanges": False,
"metadataPolicy": {
"deleteUnavailableDatasets": True,
"namesRefreshMillis": 3600000,
"datasetDefinitionRefreshAfterMillis": 3600000,
"datasetDefinitionExpireAfterMillis": 10800000,
"authTTLMillis": 86400000,
"updateMode": "PREFETCH_QUERIED",
"namesRefreshMs": 3600000,
"datasetRefreshAfterMs": 3600000,
"datasetExpireAfterMs": 10800000,
"authTTLMs": 86400000,
"datasetUpdateMode": "PREFETCH_QUERIED",
},
"type": "MYSQL",
}
response = requests.put(url, headers=headers, data=json.dumps(payload))
response = requests.post(url, headers=headers, data=json.dumps(payload))
assert (
response.status_code == 200
), f"Failed to add mysql datasource: {response.text}"
def upload_dataset(headers):
url = f"{DREMIO_HOST}/apiv2/source/s3/file_format/warehouse/sample.parquet"
payload = {"ignoreOtherFileFormats": False, "type": "Parquet"}
response = requests.put(url, headers=headers, data=json.dumps(payload))
assert response.status_code == 200, f"Failed to add dataset: {response.text}"
url = f"{DREMIO_HOST}/apiv2/source/Samples/file_format/samples.dremio.com/NYC-weather.csv"
url = f"{DREMIO_HOST}/api/v3/catalog/dremio%3A%2Fs3%2Fwarehouse"
payload = {
"fieldDelimiter": ",",
"quote": '"',
"comment": "#",
"lineDelimiter": "\r\n",
"escape": '"',
"extractHeader": False,
"trimHeader": True,
"skipFirstLine": False,
"type": "Text",
"entityType": "dataset",
"type": "PHYSICAL_DATASET",
"path": [
"s3",
"warehouse",
],
"format": {"type": "Parquet"},
}
response = requests.put(url, headers=headers, data=json.dumps(payload))
response = requests.post(url, headers=headers, data=json.dumps(payload))
assert response.status_code == 200, f"Failed to add dataset: {response.text}"
url = f"{DREMIO_HOST}/apiv2/source/Samples/file_format/samples.dremio.com/Dremio%20University/oracle-departments.xlsx"
payload = {"extractHeader": True, "hasMergedCells": False, "type": "Excel"}
response = requests.put(url, headers=headers, data=json.dumps(payload))
assert response.status_code == 200, f"Failed to add dataset: {response.text}"
url = f"{DREMIO_HOST}/apiv2/source/Samples/file_format/samples.dremio.com/Dremio%20University/googleplaystore.csv"
url = f"{DREMIO_HOST}/api/v3/catalog/dremio%3A%2FSamples%2Fsamples.dremio.com%2FNYC-weather.csv"
payload = {
"fieldDelimiter": ",",
"quote": '"',
"comment": "#",
"lineDelimiter": "\r\n",
"escape": '"',
"extractHeader": False,
"trimHeader": True,
"skipFirstLine": False,
"type": "Text",
"entityType": "dataset",
"type": "PHYSICAL_DATASET",
"path": [
"Samples",
"samples.dremio.com",
"NYC-weather.csv",
],
"format": {
"fieldDelimiter": ",",
"quote": '"',
"comment": "#",
"lineDelimiter": "\r\n",
"escape": '"',
"extractHeader": False,
"trimHeader": True,
"skipFirstLine": False,
"type": "Text",
},
}
response = requests.put(url, headers=headers, data=json.dumps(payload))
response = requests.post(url, headers=headers, data=json.dumps(payload))
assert response.status_code == 200, f"Failed to add dataset: {response.text}"
url = f"{DREMIO_HOST}/apiv2/source/Samples/file_format/samples.dremio.com/tpcds_sf1000/catalog_page/1ab266d5-18eb-4780-711d-0fa337fa6c00/0_0_0.parquet"
payload = {"ignoreOtherFileFormats": False, "type": "Parquet"}
url = f"{DREMIO_HOST}/api/v3/catalog/dremio%3A%2FSamples%2Fsamples.dremio.com%2FDremio%20University%2Foracle-departments.xlsx"
response = requests.put(url, headers=headers, data=json.dumps(payload))
payload = {
"entityType": "dataset",
"type": "PHYSICAL_DATASET",
"path": [
"Samples",
"samples.dremio.com",
"Dremio University",
"oracle-departments.xlsx",
],
"format": {"extractHeader": True, "hasMergedCells": False, "type": "Excel"},
}
response = requests.post(url, headers=headers, data=json.dumps(payload))
assert response.status_code == 200, f"Failed to add dataset: {response.text}"
url = f"{DREMIO_HOST}/api/v3/catalog/dremio%3A%2FSamples%2Fsamples.dremio.com%2FDremio%20University%2Fgoogleplaystore.csv"
payload = {
"entityType": "dataset",
"type": "PHYSICAL_DATASET",
"path": [
"Samples",
"samples.dremio.com",
"Dremio University",
"googleplaystore.csv",
],
"format": {
"fieldDelimiter": ",",
"quote": '"',
"comment": "#",
"lineDelimiter": "\r\n",
"escape": '"',
"extractHeader": False,
"trimHeader": True,
"skipFirstLine": False,
"type": "Text",
},
}
response = requests.post(url, headers=headers, data=json.dumps(payload))
assert response.status_code == 200, f"Failed to add dataset: {response.text}"
url = f"{DREMIO_HOST}/api/v3/catalog/dremio%3A%2FSamples%2Fsamples.dremio.com%2Ftpcds_sf1000%2Fcatalog_page%2F1ab266d5-18eb-4780-711d-0fa337fa6c00%2F0_0_0.parquet"
payload = {
"entityType": "dataset",
"type": "PHYSICAL_DATASET",
"path": [
"Samples",
"samples.dremio.com",
"tpcds_sf1000",
"catalog_page",
"1ab266d5-18eb-4780-711d-0fa337fa6c00",
"0_0_0.parquet",
],
"format": {"type": "Parquet"},
}
response = requests.post(url, headers=headers, data=json.dumps(payload))
assert response.status_code == 200, f"Failed to add dataset: {response.text}"
@@ -253,7 +306,7 @@ def create_view(headers):
"entityType": "dataset",
"type": "VIRTUAL_DATASET",
"path": ["space", "test_folder", "raw"],
"sql": 'SELECT * FROM s3.warehouse."sample.parquet"',
"sql": "SELECT * FROM s3.warehouse",
}
response = requests.post(url, headers=headers, data=json.dumps(payload))
assert response.status_code == 200, f"Failed to create view: {response.text}"
@@ -273,7 +326,7 @@ def create_view(headers):
"entityType": "dataset",
"type": "VIRTUAL_DATASET",
"path": ["space", "test_folder", "customers"],
"sql": 'SELECT * FROM "mysql".northwind.customers',
"sql": "SELECT * FROM mysql.northwind.customers",
"sqlContext": ["mysql", "northwind"],
}
response = requests.post(url, headers=headers, data=json.dumps(payload))
@@ -283,7 +336,7 @@ def create_view(headers):
"entityType": "dataset",
"type": "VIRTUAL_DATASET",
"path": ["space", "test_folder", "orders"],
"sql": 'SELECT * FROM "mysql".northwind.orders',
"sql": "SELECT * FROM mysql.northwind.orders",
"sqlContext": ["mysql", "northwind"],
}
response = requests.post(url, headers=headers, data=json.dumps(payload))
@@ -293,7 +346,7 @@ def create_view(headers):
"entityType": "dataset",
"type": "VIRTUAL_DATASET",
"path": ["space", "test_folder", "metadata_aspect"],
"sql": 'SELECT * FROM "mysql".metagalaxy."metadata_aspect"',
"sql": "SELECT * FROM mysql.metagalaxy.metadata_aspect",
"sqlContext": ["mysql", "metagalaxy"],
}
response = requests.post(url, headers=headers, data=json.dumps(payload))
@@ -303,7 +356,7 @@ def create_view(headers):
"entityType": "dataset",
"type": "VIRTUAL_DATASET",
"path": ["space", "test_folder", "metadata_index"],
"sql": 'SELECT * FROM "mysql".metagalaxy."metadata_index"',
"sql": "SELECT * FROM mysql.metagalaxy.metadata_index",
"sqlContext": ["mysql", "metagalaxy"],
}
response = requests.post(url, headers=headers, data=json.dumps(payload))
@@ -313,7 +366,7 @@ def create_view(headers):
"entityType": "dataset",
"type": "VIRTUAL_DATASET",
"path": ["space", "test_folder", "metadata_index_view"],
"sql": 'SELECT * FROM "mysql".metagalaxy."metadata_index_view"',
"sql": "SELECT * FROM mysql.metagalaxy.metadata_index_view",
"sqlContext": ["mysql", "metagalaxy"],
}
response = requests.post(url, headers=headers, data=json.dumps(payload))
@@ -422,14 +475,119 @@ def test_dremio_ingest(
pytestconfig,
tmp_path,
):
# Run the metadata ingestion pipeline.
# Run the metadata ingestion pipeline with specific output file
config_file = (test_resources_dir / "dremio_to_file.yml").resolve()
output_path = tmp_path / "dremio_mces.json"
run_datahub_cmd(["ingest", "-c", f"{config_file}"], tmp_path=tmp_path)
# Verify the output.
# Verify the output
mce_helpers.check_golden_file(
pytestconfig,
output_path=tmp_path / "dremio_mces.json",
output_path=output_path,
golden_path=test_resources_dir / "dremio_mces_golden.json",
ignore_paths=[],
)
@freeze_time(FROZEN_TIME)
@pytest.mark.integration
def test_dremio_platform_instance_urns(
test_resources_dir,
dremio_setup,
pytestconfig,
tmp_path,
):
config_file = (
test_resources_dir / "dremio_platform_instance_to_file.yml"
).resolve()
output_path = tmp_path / "dremio_mces.json"
run_datahub_cmd(["ingest", "-c", f"{config_file}"], tmp_path=tmp_path)
with output_path.open() as f:
content = f.read()
# Fail if the file is empty or contains only brackets
if not content or content.strip() in ("[]", "[", "]"):
pytest.fail(f"Output file is empty or invalid: {content}")
try:
# Try to load as JSON Lines first
mces = []
for line in content.splitlines():
line = line.strip()
if line and line not in ("[", "]"): # Skip empty lines and bare brackets
mce = json.loads(line)
mces.append(mce)
except json.JSONDecodeError:
# If that fails, try loading as a single JSON array
try:
mces = json.loads(content)
except json.JSONDecodeError as e:
print(f"Failed to parse file content: {content}")
raise e
# Verify MCEs
assert len(mces) > 0, "No MCEs found in output file"
# Verify the platform instances
for mce in mces:
if "entityType" not in mce:
continue
# Check dataset URN structure
if mce["entityType"] == "dataset" and "entityUrn" in mce:
assert (
"test-platform.dremio" in mce["entityUrn"]
), f"Platform instance missing in dataset URN: {mce['entityUrn']}"
# Check aspects for both datasets and containers
if "aspectName" in mce:
# Check dataPlatformInstance aspect
if mce["aspectName"] == "dataPlatformInstance":
aspect = mce["aspect"]
if not isinstance(aspect, Dict) or "json" not in aspect:
continue
aspect_json = aspect["json"]
if not isinstance(aspect_json, Dict):
continue
if "instance" not in aspect_json:
continue
instance = aspect_json["instance"]
expected_instance = "urn:li:dataPlatformInstance:(urn:li:dataPlatform:dremio,test-platform)"
assert (
instance == expected_instance
), f"Invalid platform instance format: {instance}"
# Verify against golden file
mce_helpers.check_golden_file(
pytestconfig,
output_path=output_path,
golden_path=test_resources_dir / "dremio_platform_instance_mces_golden.json",
ignore_paths=[],
)
@freeze_time(FROZEN_TIME)
@pytest.mark.integration
def test_dremio_schema_filter(
test_resources_dir,
dremio_setup,
pytestconfig,
tmp_path,
):
config_file = (test_resources_dir / "dremio_schema_filter_to_file.yml").resolve()
output_path = tmp_path / "dremio_mces.json"
run_datahub_cmd(["ingest", "-c", f"{config_file}"], tmp_path=tmp_path)
# Verify against golden file
mce_helpers.check_golden_file(
pytestconfig,
output_path=output_path,
golden_path=test_resources_dir / "dremio_schema_filter_mces_golden.json",
ignore_paths=[],
)

View File

@@ -0,0 +1,123 @@
from unittest.mock import Mock
import pytest
from datahub.ingestion.source.dremio.dremio_api import DremioAPIOperations
from datahub.ingestion.source.dremio.dremio_config import DremioSourceConfig
from datahub.ingestion.source.dremio.dremio_reporting import DremioSourceReport
class TestDremioContainerFiltering:
@pytest.fixture
def dremio_api(self, monkeypatch):
# Mock the requests.Session
mock_session = Mock()
monkeypatch.setattr("requests.Session", Mock(return_value=mock_session))
# Mock the authentication response
mock_session.post.return_value.json.return_value = {"token": "dummy-token"}
mock_session.post.return_value.status_code = 200
config = DremioSourceConfig(
hostname="dummy-host",
port=9047,
tls=False,
authentication_method="password",
username="dummy-user",
password="dummy-password",
schema_pattern=dict(allow=[".*"], deny=[]),
)
report = DremioSourceReport()
return DremioAPIOperations(config, report)
def test_basic_allow_pattern(self, dremio_api):
"""Test basic allow pattern matching"""
dremio_api.allow_schema_pattern = ["test"]
dremio_api.deny_schema_pattern = []
assert dremio_api.should_include_container([], "test")
assert dremio_api.should_include_container(["test"], "subfolder")
assert not dremio_api.should_include_container([], "prod_space")
def test_basic_deny_pattern(self, dremio_api):
"""Test basic deny pattern matching"""
dremio_api.allow_schema_pattern = [".*"]
dremio_api.deny_schema_pattern = ["test_space.*"]
assert not dremio_api.should_include_container([], "test_space")
assert not dremio_api.should_include_container(["test_space"], "subfolder")
assert dremio_api.should_include_container([], "prod_space")
def test_hierarchical_matching(self, dremio_api):
"""Test matching with hierarchical paths"""
dremio_api.allow_schema_pattern = ["prod.data.*"]
dremio_api.deny_schema_pattern = []
assert dremio_api.should_include_container([], "prod")
assert dremio_api.should_include_container(["prod"], "data")
assert dremio_api.should_include_container(["prod", "data"], "sales")
assert not dremio_api.should_include_container([], "dev")
assert not dremio_api.should_include_container(["dev"], "data")
def test_allow_and_deny_patterns(self, dremio_api):
"""Test combination of allow and deny patterns"""
dremio_api.allow_schema_pattern = ["prod.*"]
dremio_api.deny_schema_pattern = ["prod.internal.*"]
assert dremio_api.should_include_container([], "prod")
assert dremio_api.should_include_container(["prod"], "public")
assert dremio_api.should_include_container(["prod", "public"], "next")
assert not dremio_api.should_include_container(["prod"], "internal")
assert not dremio_api.should_include_container(["prod", "internal"], "secrets")
def test_wildcard_patterns(self, dremio_api):
"""Test wildcard pattern handling"""
dremio_api.allow_schema_pattern = [".*"]
dremio_api.deny_schema_pattern = []
assert dremio_api.should_include_container([], "any_space")
assert dremio_api.should_include_container(["any_space"], "any_folder")
# Test with specific wildcard in middle
dremio_api.allow_schema_pattern = ["prod.*.public"]
assert dremio_api.should_include_container(["prod", "customer"], "public")
assert not dremio_api.should_include_container(["prod", "customer"], "private")
def test_case_insensitive_matching(self, dremio_api):
"""Test case-insensitive pattern matching"""
dremio_api.allow_schema_pattern = ["PROD.*"]
dremio_api.deny_schema_pattern = []
assert dremio_api.should_include_container([], "prod")
assert dremio_api.should_include_container([], "PROD")
assert dremio_api.should_include_container(["prod"], "DATA")
assert dremio_api.should_include_container(["PROD"], "data")
def test_empty_patterns(self, dremio_api):
"""Test behavior with empty patterns"""
dremio_api.allow_schema_pattern = [".*"]
dremio_api.deny_schema_pattern = []
# The default wildcard allow pattern should allow everything
assert dremio_api.should_include_container([], "any_space")
assert dremio_api.should_include_container(["any_space"], "any_folder")
def test_partial_path_matching(self, dremio_api):
"""Test matching behavior with partial paths"""
dremio_api.allow_schema_pattern = ["^pr.*.data.*"]
dremio_api.deny_schema_pattern = []
assert dremio_api.should_include_container(["prod"], "data")
# Descendant paths under the allowed branch should also match the wildcard pattern
assert dremio_api.should_include_container(["prod", "data"], "sales")
assert not dremio_api.should_include_container([], "dev")
assert not dremio_api.should_include_container(["dev", "data"], "sales")
def test_partial_start_end_chars(self, dremio_api):
"""Test matching behavior with partial paths"""
dremio_api.allow_schema_pattern = ["pr.*.data$"]
dremio_api.deny_schema_pattern = []
assert dremio_api.should_include_container(["prod"], "data")
# The trailing $ anchor should prevent matches beyond prod.data
assert not dremio_api.should_include_container(["prod", "data"], "sales")