diff --git a/ingestion/src/metadata/ingestion/source/dashboard/powerbi/client.py b/ingestion/src/metadata/ingestion/source/dashboard/powerbi/client.py
index cfaa47379ef..a7a7653ba47 100644
--- a/ingestion/src/metadata/ingestion/source/dashboard/powerbi/client.py
+++ b/ingestion/src/metadata/ingestion/source/dashboard/powerbi/client.py
@@ -23,6 +23,7 @@ from pydantic import BaseModel, ConfigDict
 from metadata.generated.schema.entity.services.connections.dashboard.powerBIConnection import (
     PowerBIConnection,
 )
+from metadata.generated.schema.type.filterPattern import FilterPattern
 from metadata.ingestion.api.steps import InvalidSourceException
 from metadata.ingestion.ometa.client import REST, ClientConfig
 from metadata.ingestion.source.dashboard.powerbi.file_client import PowerBiFileClient
@@ -43,6 +44,7 @@ from metadata.ingestion.source.dashboard.powerbi.models import (
     Workspaces,
     WorkSpaceScanResponse,
 )
+from metadata.utils.filters import validate_regex
 from metadata.utils.logger import utils_logger
 
 logger = utils_logger()
@@ -253,8 +255,92 @@ class PowerBiApiClient:
 
         return None
 
+    def regex_to_odata_condition(self, regex: str) -> str:
+        """
+        Convert a regex pattern to an OData filter condition
+
+        Only the ``^`` / ``$`` anchors and the ``.*`` wildcard are interpreted;
+        other regex syntax is not translated.
+        """
+        try:
+            # Handle empty pattern
+            if not regex:
+                return ""
+
+            if regex.startswith("^") and regex.endswith("$"):
+                # Exact match
+                template = "trim(name) eq '{}'"
+                literal = regex[1:-1]
+            elif regex.startswith("^"):
+                # Starts with
+                template = "startswith(name, '{}')"
+                literal = regex[1:].split(".*", 1)[0]
+            elif regex.endswith("$"):
+                # Ends with
+                template = "endswith(name, '{}')"
+                literal = regex[:-1].split(".*")[-1]
+            elif regex.startswith(".*") and regex.endswith(".*"):
+                # Contains, wildcards on both sides
+                template = "contains(name, '{}')"
+                literal = regex.split(".*")[1]
+            else:
+                # Contains, plain text
+                template = "contains(name, '{}')"
+                literal = regex
+
+            # OData string literals escape a single quote by doubling it
+            return template.format(literal.replace("'", "''"))
+        except Exception as exc:
+            logger.warning(
+                f"Error converting regex '{regex}' to OData condition: {exc}"
+            )
+            return ""
+
+    def create_filter_query(self, filter_pattern) -> Optional[str]:
+        """
+        Create a complete filter query for workspaces from filter_pattern
+
+        Returns None when the pattern yields no condition or cannot be converted.
+        """
+        try:
+            includes = filter_pattern.includes or []
+            excludes = filter_pattern.excludes or []
+            validate_regex(includes)
+            validate_regex(excludes)
+
+            include_conditions = [
+                condition
+                for condition in map(self.regex_to_odata_condition, includes)
+                if condition
+            ]
+            exclude_conditions = [
+                f"not({condition})"
+                for condition in map(self.regex_to_odata_condition, excludes)
+                if condition
+            ]
+
+            filter_conditions = []
+            if include_conditions:
+                include_clause = " or ".join(include_conditions)
+                # OData evaluates "and" before "or": group several includes so the
+                # exclusions apply to all of them, not only to the last one.
+                if len(include_conditions) > 1 and exclude_conditions:
+                    include_clause = f"({include_clause})"
+                filter_conditions.append(include_clause)
+            filter_conditions.extend(exclude_conditions)
+
+            return " and ".join(filter_conditions) or None
+        except Exception as exc:
+            logger.warning(
+                f"Creating filter query from the project filter pattern failed: {exc}. "
+                "The projects will be filtered further inside OpenMetadata."
+            )
+            return None
+
     # pylint: disable=too-many-branches,too-many-statements
-    def fetch_all_workspaces(self) -> Optional[List[Group]]:
+    def fetch_all_workspaces(
+        self, filter_pattern: Optional[FilterPattern] = None
+    ) -> Optional[List[Group]]:
         """Method to fetch all powerbi workspace details
         Returns:
             Group
@@ -293,6 +379,12 @@ class PowerBiApiClient:
                     "$top": str(entities_per_page),
                     "$skip": str(index * entities_per_page),
                 }
+                if filter_pattern:
+                    filter_query = self.create_filter_query(filter_pattern)
+                    # Never send "$filter": None when no query could be built
+                    if filter_query:
+                        params_data["$filter"] = filter_query
+
                 response = self.client.get(api_url, data=params_data)
                 if (
                     not response
diff --git a/ingestion/src/metadata/ingestion/source/dashboard/powerbi/metadata.py b/ingestion/src/metadata/ingestion/source/dashboard/powerbi/metadata.py
index 6a0aeee8af8..5eecaa50fc0 100644
--- a/ingestion/src/metadata/ingestion/source/dashboard/powerbi/metadata.py
+++ b/ingestion/src/metadata/ingestion/source/dashboard/powerbi/metadata.py
@@ -132,7 +132,8 @@ class PowerbiSource(DashboardServiceSource):
         """
         fetch all the group workspace ids
         """
-        groups = self.client.api_client.fetch_all_workspaces()
+        filter_pattern = self.source_config.projectFilterPattern
+        groups = self.client.api_client.fetch_all_workspaces(filter_pattern)
         for group in groups:
             # add the dashboards to the groups
             group.dashboards.extend(
@@ -171,7 +172,8 @@ class PowerbiSource(DashboardServiceSource):
         fetch all the workspace ids
         """
         groups = []
-        workspaces = self.client.api_client.fetch_all_workspaces()
+        filter_pattern = self.source_config.projectFilterPattern
+        workspaces = self.client.api_client.fetch_all_workspaces(filter_pattern)
         if workspaces:
             workspace_id_list = [workspace.id for workspace in workspaces]
 
diff --git a/ingestion/tests/unit/test_powerbi_filter_query.py b/ingestion/tests/unit/test_powerbi_filter_query.py
new file mode 100644
index 00000000000..f82d028e051
--- /dev/null
+++ b/ingestion/tests/unit/test_powerbi_filter_query.py
@@ -0,0 +1,71 @@
+import pytest
+
+from metadata.generated.schema.type.filterPattern import FilterPattern
+from metadata.ingestion.source.dashboard.powerbi.client import PowerBiApiClient
+
+# Test cases dictionary
+test_cases = {
+    "exact_match": {
+        "input": FilterPattern(includes=["^exact_workspace$"], excludes=[]),
+        "expected": "trim(name) eq 'exact_workspace'",
+    },
+    "starts_with": {
+        "input": FilterPattern(includes=["^dev.*"], excludes=[]),
+        "expected": "startswith(name, 'dev')",
+    },
+    "ends_with": {
+        "input": FilterPattern(includes=[".*prod$"], excludes=[]),
+        "expected": "endswith(name, 'prod')",
+    },
+    "contains": {
+        "input": FilterPattern(includes=[".*test.*"], excludes=[]),
+        "expected": "contains(name, 'test')",
+    },
+    "multiple_includes": {
+        "input": FilterPattern(includes=["^dev.*", ".*prod$"], excludes=[]),
+        "expected": "startswith(name, 'dev') or endswith(name, 'prod')",
+    },
+    "multiple_excludes": {
+        "input": FilterPattern(includes=[], excludes=["^test.*", ".*temp$"]),
+        "expected": "not(startswith(name, 'test')) and not(endswith(name, 'temp'))",
+    },
+    "includes_and_excludes": {
+        "input": FilterPattern(includes=["^prod.*"], excludes=[".*temp$"]),
+        "expected": "startswith(name, 'prod') and not(endswith(name, 'temp'))",
+    },
+    "multiple_includes_and_excludes": {
+        "input": FilterPattern(includes=["^dev.*", ".*prod$"], excludes=[".*temp$"]),
+        "expected": (
+            "(startswith(name, 'dev') or endswith(name, 'prod'))"
+            " and not(endswith(name, 'temp'))"
+        ),
+    },
+    "quote_is_escaped": {
+        "input": FilterPattern(includes=["^O'Brien$"], excludes=[]),
+        "expected": "trim(name) eq 'O''Brien'",
+    },
+    "includes_without_regex": {
+        "input": FilterPattern(includes=["test"], excludes=[]),
+        "expected": "contains(name, 'test')",
+    },
+    "excludes_without_regex": {
+        "input": FilterPattern(includes=[], excludes=["test"]),
+        "expected": "not(contains(name, 'test'))",
+    },
+    "empty_patterns": {
+        "input": FilterPattern(includes=[], excludes=[]),
+        "expected": None,
+    },
+}
+
+# Mock class that inherits from PowerBiApiClient
+class MockPowerBiApiClient(PowerBiApiClient):
+    def __init__(self):
+        pass
+
+
+@pytest.mark.parametrize("test_name,test_data", test_cases.items())
+def test_filter_query(test_name, test_data):
+    client = MockPowerBiApiClient()
+    result = client.create_filter_query(test_data["input"])
+    assert result == test_data["expected"], f"Failed test: {test_name}"