feat(ingestion/tableau): introduce project_path_pattern (#10855)

Co-authored-by: Yanik Häni <Yanik.Haeni1@swisscom.com>
This commit is contained in:
haeniya 2024-09-27 18:46:39 +02:00 committed by GitHub
parent 1a73c664f0
commit 99bfcefb72
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 645 additions and 26 deletions

View File

@ -313,12 +313,22 @@ class TableauConfig(
# Tableau project pattern
project_pattern: AllowDenyPattern = Field(
default=AllowDenyPattern.allow_all(),
description="Filter for specific Tableau projects. For example, use 'My Project' to ingest a root-level Project with name 'My Project', or 'My Project/Nested Project' to ingest a nested Project with name 'Nested Project'. "
description="[deprecated] Use project_path_pattern instead. Filter for specific Tableau projects. For example, use 'My Project' to ingest a root-level Project with name 'My Project', or 'My Project/Nested Project' to ingest a nested Project with name 'Nested Project'. "
"By default, all Projects nested inside a matching Project will be included in ingestion. "
"You can both allow and deny projects based on their name using their name, or a Regex pattern. "
"Deny patterns always take precedence over allow patterns. "
"By default, all projects will be ingested.",
)
_deprecate_projects_pattern = pydantic_field_deprecated("project_pattern")
project_path_pattern: AllowDenyPattern = Field(
default=AllowDenyPattern.allow_all(),
description="Filters Tableau projects by their full path. For instance, 'My Project/Nested Project' targets a specific nested project named 'Nested Project'."
" This is also useful when you need to exclude all nested projects under a particular project."
" You can allow or deny projects by specifying their path or a regular expression pattern."
" Deny patterns always override allow patterns."
" By default, all projects are ingested.",
)
project_path_separator: str = Field(
default="/",
@ -454,17 +464,23 @@ class TableauConfig(
def projects_backward_compatibility(cls, values: Dict) -> Dict:
    """Reconcile the deprecated project filters with ``project_path_pattern``.

    Reads the raw ``projects``, ``project_pattern`` and
    ``project_path_pattern`` entries from ``values``:

    * Only ``projects`` set: a warning is logged and ``project_pattern`` is
      initialised with one anchored (``^name$``) allow entry per project.
    * ``projects`` combined with either pattern option: rejected.
    * ``project_pattern`` combined with ``project_path_pattern``: rejected.

    Args:
        values: Mapping of configuration keys to their values.
            NOTE(review): presumably the raw, pre-validation config dict
            (the decorator is not visible here) - confirm.

    Returns:
        The same ``values`` mapping, possibly with ``project_pattern`` added.

    Raises:
        ValueError: If a deprecated option is combined with its replacement.
    """
    projects = values.get("projects")
    project_pattern = values.get("project_pattern")
    project_path_pattern = values.get("project_path_pattern")

    if project_pattern is None and project_path_pattern is None and projects:
        logger.warning(
            "projects is deprecated, please use project_path_pattern instead."
        )
        logger.info("Initializing project_pattern from projects")
        # Anchor every name so that only exact project names are allowed.
        values["project_pattern"] = AllowDenyPattern(
            allow=[f"^{prj}$" for prj in projects]
        )
    elif (project_pattern or project_path_pattern) and projects:
        raise ValueError(
            "projects is deprecated. Please use project_path_pattern only."
        )
    elif project_path_pattern and project_pattern:
        raise ValueError(
            "project_pattern is deprecated. Please use project_path_pattern only."
        )

    return values
@ -850,12 +866,13 @@ class TableauSiteSource:
def _is_allowed_project(self, project: TableauProject) -> bool:
    """Return whether ``project`` passes both project filters.

    A project is allowed when ``project_pattern`` allows either its name or
    its path, and ``project_path_pattern`` allows its path. A rejection is
    logged at info level.

    Args:
        project: The Tableau project to evaluate.

    Returns:
        True if the project is allowed by the configured patterns.
    """
    # Resolve the path once; it is needed by both pattern checks.
    # NOTE(review): assumes _get_project_path has no side effects - confirm.
    project_path = self._get_project_path(project)

    # Either project name or project path should exist in allow
    is_allowed: bool = (
        self.config.project_pattern.allowed(project.name)
        or self.config.project_pattern.allowed(project_path)
    ) and self.config.project_path_pattern.allowed(project_path)

    if not is_allowed:
        logger.info(
            f"Project ({project.name}) is not allowed as per project_pattern or project_path_pattern"
        )

    return is_allowed
@ -887,28 +904,29 @@ class TableauSiteSource:
logger.debug(f"Project {project.name} is added in project registry")
projects_to_ingest[project.id] = project
# We rely on automatic browse paths (v2) when creating containers. That's why we need to sort the projects here.
# Otherwise, nested projects will not have the correct browse paths if not created in correct order / hierarchy.
self.tableau_project_registry = OrderedDict(
sorted(projects_to_ingest.items(), key=lambda item: len(item[1].path))
)
if self.config.extract_project_hierarchy is False:
logger.debug(
"Skipping project hierarchy processing as configuration extract_project_hierarchy is "
"disabled"
)
return
else:
logger.debug(
"Reevaluating projects as extract_project_hierarchy is enabled"
)
logger.debug("Reevaluating projects as extract_project_hierarchy is enabled")
for project in list_of_skip_projects:
if (
project.parent_id in projects_to_ingest
and self._is_denied_project(project) is False
):
logger.debug(f"Project {project.name} is added in project registry")
projects_to_ingest[project.id] = project
for project in list_of_skip_projects:
if (
project.parent_id in self.tableau_project_registry
and self._is_denied_project(project) is False
):
logger.debug(f"Project {project.name} is added in project registry")
self.tableau_project_registry[project.id] = project
# We rely on automatic browse paths (v2) when creating containers. That's why we need to sort the projects here.
# Otherwise, nested projects will not have the correct browse paths if not created in correct order / hierarchy.
self.tableau_project_registry = OrderedDict(
sorted(projects_to_ingest.items(), key=lambda item: len(item[1].path))
)
def _init_datasource_registry(self) -> None:
if self.server is None:

View File

@ -0,0 +1,352 @@
[
{
"entityType": "container",
"entityUrn": "urn:li:container:5ec314b9630974ec084f5dfd3849f87b",
"changeType": "UPSERT",
"aspectName": "containerProperties",
"aspect": {
"json": {
"customProperties": {
"platform": "tableau",
"project_id": "190a6a5c-63ed-4de1-8045-faeae5df5b01"
},
"name": "default"
}
},
"systemMetadata": {
"lastObserved": 1727349368101,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:5ec314b9630974ec084f5dfd3849f87b",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
"json": {
"removed": false
}
},
"systemMetadata": {
"lastObserved": 1727349368102,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:5ec314b9630974ec084f5dfd3849f87b",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
"json": {
"platform": "urn:li:dataPlatform:tableau"
}
},
"systemMetadata": {
"lastObserved": 1727349368103,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:5ec314b9630974ec084f5dfd3849f87b",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
"json": {
"typeNames": [
"Project"
]
}
},
"systemMetadata": {
"lastObserved": 1727349368104,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:5ec314b9630974ec084f5dfd3849f87b",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": []
}
},
"systemMetadata": {
"lastObserved": 1727349368105,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:beaddce9d1e89ab503ae6408fb77d4ce",
"changeType": "UPSERT",
"aspectName": "containerProperties",
"aspect": {
"json": {
"customProperties": {
"platform": "tableau",
"project_id": "79d02655-88e5-45a6-9f9b-eeaf5fe54903"
},
"name": "DenyProject"
}
},
"systemMetadata": {
"lastObserved": 1727349368108,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:beaddce9d1e89ab503ae6408fb77d4ce",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
"json": {
"removed": false
}
},
"systemMetadata": {
"lastObserved": 1727349368109,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:beaddce9d1e89ab503ae6408fb77d4ce",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
"json": {
"platform": "urn:li:dataPlatform:tableau"
}
},
"systemMetadata": {
"lastObserved": 1727349368109,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:beaddce9d1e89ab503ae6408fb77d4ce",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
"json": {
"typeNames": [
"Project"
]
}
},
"systemMetadata": {
"lastObserved": 1727349368110,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:beaddce9d1e89ab503ae6408fb77d4ce",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
"json": {
"container": "urn:li:container:5ec314b9630974ec084f5dfd3849f87b"
}
},
"systemMetadata": {
"lastObserved": 1727349368111,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:beaddce9d1e89ab503ae6408fb77d4ce",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:5ec314b9630974ec084f5dfd3849f87b",
"urn": "urn:li:container:5ec314b9630974ec084f5dfd3849f87b"
}
]
}
},
"systemMetadata": {
"lastObserved": 1727349368112,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:595877512935338b94eac9e06cf20607",
"changeType": "UPSERT",
"aspectName": "containerProperties",
"aspect": {
"json": {
"customProperties": {
"platform": "tableau",
"workbook_id": "ee012e36-d916-4c21-94ab-f0d66736af4e"
},
"externalUrl": "https://do-not-connect/#/site/acryl/workbooks/17904",
"name": "Deny Pattern WorkBook",
"description": ""
}
},
"systemMetadata": {
"lastObserved": 1727349368113,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:595877512935338b94eac9e06cf20607",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
"json": {
"removed": false
}
},
"systemMetadata": {
"lastObserved": 1727349368114,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:595877512935338b94eac9e06cf20607",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
"json": {
"platform": "urn:li:dataPlatform:tableau"
}
},
"systemMetadata": {
"lastObserved": 1727349368115,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:595877512935338b94eac9e06cf20607",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
"json": {
"typeNames": [
"Workbook"
]
}
},
"systemMetadata": {
"lastObserved": 1727349368116,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:595877512935338b94eac9e06cf20607",
"changeType": "UPSERT",
"aspectName": "ownership",
"aspect": {
"json": {
"owners": [
{
"owner": "urn:li:corpuser:jawadqu@gmail.com",
"type": "DATAOWNER"
}
],
"ownerTypes": {},
"lastModified": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
}
}
},
"systemMetadata": {
"lastObserved": 1727349368117,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:595877512935338b94eac9e06cf20607",
"changeType": "UPSERT",
"aspectName": "container",
"aspect": {
"json": {
"container": "urn:li:container:beaddce9d1e89ab503ae6408fb77d4ce"
}
},
"systemMetadata": {
"lastObserved": 1727349368118,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:595877512935338b94eac9e06cf20607",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": [
{
"id": "urn:li:container:5ec314b9630974ec084f5dfd3849f87b",
"urn": "urn:li:container:5ec314b9630974ec084f5dfd3849f87b"
},
{
"id": "urn:li:container:beaddce9d1e89ab503ae6408fb77d4ce",
"urn": "urn:li:container:beaddce9d1e89ab503ae6408fb77d4ce"
}
]
}
},
"systemMetadata": {
"lastObserved": 1727349368118,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
}
]

View File

@ -0,0 +1,184 @@
[
{
"entityType": "container",
"entityUrn": "urn:li:container:252a054d4dd93cd657735aa46dd71370",
"changeType": "UPSERT",
"aspectName": "containerProperties",
"aspect": {
"json": {
"customProperties": {
"platform": "tableau",
"project_id": "c30aafe5-44f4-4f28-80d3-d181010a263c"
},
"name": "Project 2"
}
},
"systemMetadata": {
"lastObserved": 1727349368232,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:252a054d4dd93cd657735aa46dd71370",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
"json": {
"removed": false
}
},
"systemMetadata": {
"lastObserved": 1727349368233,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:252a054d4dd93cd657735aa46dd71370",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
"json": {
"platform": "urn:li:dataPlatform:tableau"
}
},
"systemMetadata": {
"lastObserved": 1727349368233,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:252a054d4dd93cd657735aa46dd71370",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
"json": {
"typeNames": [
"Project"
]
}
},
"systemMetadata": {
"lastObserved": 1727349368234,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:252a054d4dd93cd657735aa46dd71370",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": []
}
},
"systemMetadata": {
"lastObserved": 1727349368235,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:d2dcd6bd1bb954d62f1cfc68332ee873",
"changeType": "UPSERT",
"aspectName": "containerProperties",
"aspect": {
"json": {
"customProperties": {
"platform": "tableau",
"project_id": "910733aa-2e95-4ac3-a2e8-71570751099d"
},
"name": "Samples"
}
},
"systemMetadata": {
"lastObserved": 1727349368238,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:d2dcd6bd1bb954d62f1cfc68332ee873",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
"json": {
"removed": false
}
},
"systemMetadata": {
"lastObserved": 1727349368239,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:d2dcd6bd1bb954d62f1cfc68332ee873",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
"json": {
"platform": "urn:li:dataPlatform:tableau"
}
},
"systemMetadata": {
"lastObserved": 1727349368239,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:d2dcd6bd1bb954d62f1cfc68332ee873",
"changeType": "UPSERT",
"aspectName": "subTypes",
"aspect": {
"json": {
"typeNames": [
"Project"
]
}
},
"systemMetadata": {
"lastObserved": 1727349368240,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
},
{
"entityType": "container",
"entityUrn": "urn:li:container:d2dcd6bd1bb954d62f1cfc68332ee873",
"changeType": "UPSERT",
"aspectName": "browsePathsV2",
"aspect": {
"json": {
"path": []
}
},
"systemMetadata": {
"lastObserved": 1727349368241,
"runId": "tableau-test",
"lastRunId": "no-run-id-provided",
"pipelineName": "tableau-test-pipeline"
}
}
]

View File

@ -545,7 +545,72 @@ def test_value_error_projects_and_project_pattern(
pipeline_config=new_config,
)
except Exception as e:
assert "projects is deprecated. Please use project_pattern only" in str(e)
assert "projects is deprecated. Please use project_path_pattern only" in str(e)
def test_project_pattern_deprecation(pytestconfig, tmp_path, mock_datahub_graph):
    """Setting project_pattern together with project_path_pattern must fail."""
    output_file_name: str = "tableau_project_pattern_deprecation_mces.json"
    golden_file_name: str = "tableau_project_pattern_deprecation_mces_golden.json"

    new_config = config_source_default.copy()
    del new_config["projects"]

    new_config["project_pattern"] = {"allow": ["^Samples$"]}
    new_config["project_path_pattern"] = {"allow": ["^Samples$"]}

    # Ingestion should raise an error mentioning the deprecation
    try:
        tableau_ingest_common(
            pytestconfig,
            tmp_path,
            mock_data(),
            golden_file_name,
            output_file_name,
            mock_datahub_graph,
            pipeline_config=new_config,
        )
    except Exception as e:
        assert (
            "project_pattern is deprecated. Please use project_path_pattern only"
            in str(e)
        )
    else:
        # Without this branch the test would pass silently if the config
        # were accepted and no exception were raised at all.
        raise AssertionError(
            "Expected ingestion to fail when both project_pattern and "
            "project_path_pattern are set"
        )
def test_project_path_pattern_allow(pytestconfig, tmp_path, mock_datahub_graph):
    """Run ingestion with a project_path_pattern allow entry for 'default/DenyProject'."""
    golden_name: str = "tableau_project_path_pattern_allow_mces_golden.json"
    output_name: str = "tableau_project_path_pattern_allow_mces.json"

    # Start from the shared defaults, minus the deprecated "projects" key.
    pipeline_cfg = config_source_default.copy()
    pipeline_cfg.pop("projects")
    pipeline_cfg["project_path_pattern"] = {"allow": ["default/DenyProject"]}

    tableau_ingest_common(
        pytestconfig,
        tmp_path,
        mock_data(),
        golden_name,
        output_name,
        mock_datahub_graph,
        pipeline_config=pipeline_cfg,
    )
def test_project_path_pattern_deny(pytestconfig, tmp_path, mock_datahub_graph):
    """Run ingestion with a project_path_pattern deny entry of '^default.*'."""
    golden_name: str = "tableau_project_path_pattern_deny_mces_golden.json"
    output_name: str = "tableau_project_path_pattern_deny_mces.json"

    # Start from the shared defaults, minus the deprecated "projects" key.
    pipeline_cfg = config_source_default.copy()
    pipeline_cfg.pop("projects")
    pipeline_cfg["project_path_pattern"] = {"deny": ["^default.*"]}

    tableau_ingest_common(
        pytestconfig,
        tmp_path,
        mock_data(),
        golden_name,
        output_name,
        mock_datahub_graph,
        pipeline_config=pipeline_cfg,
    )
@freeze_time(FROZEN_TIME)