feat(ingest): add preset source (#10954)

Co-authored-by: MARK CHENG <hcheng@wealthsimple.com>
Co-authored-by: hwmarkcheng <94201005+hwmarkcheng@users.noreply.github.com>
This commit is contained in:
Shuixi Li 2024-10-09 23:27:31 -04:00 committed by GitHub
parent 0414443b77
commit f147b51fc8
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 1068 additions and 10 deletions

View File

@ -722,6 +722,7 @@ entry_points = {
"snowflake-summary = datahub.ingestion.source.snowflake.snowflake_summary:SnowflakeSummarySource",
"snowflake-queries = datahub.ingestion.source.snowflake.snowflake_queries:SnowflakeQueriesSource",
"superset = datahub.ingestion.source.superset:SupersetSource",
"preset = datahub.ingestion.source.preset:PresetSource",
"tableau = datahub.ingestion.source.tableau.tableau:TableauSource",
"openapi = datahub.ingestion.source.openapi:OpenApiSource",
"metabase = datahub.ingestion.source.metabase:MetabaseSource",

View File

@ -0,0 +1,114 @@
import logging
from typing import Dict, Optional
import requests
from pydantic.class_validators import root_validator, validator
from pydantic.fields import Field
from datahub.emitter.mce_builder import DEFAULT_ENV
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.decorators import (
SourceCapability,
SupportStatus,
capability,
config_class,
platform_name,
support_status,
)
from datahub.ingestion.source.state.stale_entity_removal_handler import (
StaleEntityRemovalSourceReport,
StatefulStaleMetadataRemovalConfig,
)
from datahub.ingestion.source.superset import SupersetConfig, SupersetSource
from datahub.utilities import config_clean
logger = logging.getLogger(__name__)
class PresetConfig(SupersetConfig):
manager_uri: str = Field(
default="https://api.app.preset.io", description="Preset.io API URL"
)
connect_uri: str = Field(default="", description="Preset workspace URL.")
display_uri: Optional[str] = Field(
default=None,
description="optional URL to use in links (if `connect_uri` is only for ingestion)",
)
api_key: Optional[str] = Field(default=None, description="Preset.io API key.")
api_secret: Optional[str] = Field(default=None, description="Preset.io API secret.")
# Configuration for stateful ingestion
stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field(
default=None, description="Preset Stateful Ingestion Config."
)
options: Dict = Field(default={}, description="")
env: str = Field(
default=DEFAULT_ENV,
description="Environment to use in namespace when constructing URNs",
)
database_alias: Dict[str, str] = Field(
default={},
description="Can be used to change mapping for database names in superset to what you have in datahub",
)
@validator("connect_uri", "display_uri")
def remove_trailing_slash(cls, v):
return config_clean.remove_trailing_slashes(v)
@root_validator
def default_display_uri_to_connect_uri(cls, values):
base = values.get("display_uri")
if base is None:
values["display_uri"] = values.get("connect_uri")
return values
@platform_name("Preset")
@config_class(PresetConfig)
@support_status(SupportStatus.TESTING)
@capability(
SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion"
)
class PresetSource(SupersetSource):
"""
Variation of the Superset plugin that works with Preset.io (Apache Superset SaaS).
"""
config: PresetConfig
report: StaleEntityRemovalSourceReport
platform = "preset"
def __init__(self, ctx: PipelineContext, config: PresetConfig):
logger.info(f"ctx is {ctx}")
super().__init__(ctx, config)
self.config = config
self.report = StaleEntityRemovalSourceReport()
def login(self):
try:
login_response = requests.post(
f"{self.config.manager_uri}/v1/auth/",
json={"name": self.config.api_key, "secret": self.config.api_secret},
)
except requests.exceptions.RequestException as e:
logger.error(f"Failed to authenticate with Preset: {e}")
raise e
self.access_token = login_response.json()["payload"]["access_token"]
logger.debug("Got access token from Preset")
requests_session = requests.Session()
requests_session.headers.update(
{
"Authorization": f"Bearer {self.access_token}",
"Content-Type": "application/json",
"Accept": "*/*",
}
)
# Test the connection
test_response = requests_session.get(f"{self.config.connect_uri}/version")
if not test_response.ok:
logger.error("Unable to connect to workspace")
return requests_session

View File

@ -101,7 +101,11 @@ class SupersetConfig(
)
username: Optional[str] = Field(default=None, description="Superset username.")
password: Optional[str] = Field(default=None, description="Superset password.")
api_key: Optional[str] = Field(default=None, description="Preset.io API key.")
api_secret: Optional[str] = Field(default=None, description="Preset.io API secret.")
manager_uri: str = Field(
default="https://api.app.preset.io", description="Preset.io API URL"
)
# Configuration for stateful ingestion
stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = Field(
default=None, description="Superset Stateful Ingestion Config."
@ -179,7 +183,14 @@ class SupersetSource(StatefulIngestionSourceBase):
super().__init__(config, ctx)
self.config = config
self.report = StaleEntityRemovalSourceReport()
if self.config.domain:
self.domain_registry = DomainRegistry(
cached_domains=[domain_id for domain_id in self.config.domain],
graph=self.ctx.graph,
)
self.session = self.login()
def login(self) -> requests.Session:
login_response = requests.post(
f"{self.config.connect_uri}/api/v1/security/login",
json={
@ -193,8 +204,8 @@ class SupersetSource(StatefulIngestionSourceBase):
self.access_token = login_response.json()["access_token"]
logger.debug("Got access token from superset")
self.session = requests.Session()
self.session.headers.update(
requests_session = requests.Session()
requests_session.headers.update(
{
"Authorization": f"Bearer {self.access_token}",
"Content-Type": "application/json",
@ -202,17 +213,14 @@ class SupersetSource(StatefulIngestionSourceBase):
}
)
if self.config.domain:
self.domain_registry = DomainRegistry(
cached_domains=[domain_id for domain_id in self.config.domain],
graph=self.ctx.graph,
)
# Test the connection
test_response = self.session.get(f"{self.config.connect_uri}/api/v1/dashboard/")
test_response = requests_session.get(
f"{self.config.connect_uri}/api/v1/dashboard/"
)
if test_response.status_code == 200:
pass
# TODO(Gabe): how should we message about this error?
return requests_session
@classmethod
def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:

View File

@ -0,0 +1,286 @@
[
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DashboardSnapshot": {
"urn": "urn:li:dashboard:(preset,1)",
"aspects": [
{
"com.linkedin.pegasus2avro.common.Status": {
"removed": false
}
},
{
"com.linkedin.pegasus2avro.dashboard.DashboardInfo": {
"customProperties": {
"Status": "published",
"IsPublished": "true",
"Owners": "test_username_1, test_username_2",
"IsCertified": "true",
"CertifiedBy": "Certification team",
"CertificationDetails": "Approved"
},
"title": "test_dashboard_title_1",
"description": "",
"charts": [
"urn:li:chart:(preset,10)",
"urn:li:chart:(preset,11)"
],
"datasets": [],
"lastModified": {
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
},
"lastModified": {
"time": 1720594800000,
"actor": "urn:li:corpuser:test_username_1"
}
},
"dashboardUrl": "mock://mock-domain.preset.io/dashboard/test_dashboard_url_1"
}
}
]
}
},
"systemMetadata": {
"lastObserved": 1720594800000,
"runId": "preset-test",
"lastRunId": "no-run-id-provided"
}
},
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DashboardSnapshot": {
"urn": "urn:li:dashboard:(preset,2)",
"aspects": [
{
"com.linkedin.pegasus2avro.common.Status": {
"removed": false
}
},
{
"com.linkedin.pegasus2avro.dashboard.DashboardInfo": {
"customProperties": {
"Status": "draft",
"IsPublished": "false",
"Owners": "unknown",
"IsCertified": "false"
},
"title": "test_dashboard_title_2",
"description": "",
"charts": [
"urn:li:chart:(preset,12)",
"urn:li:chart:(preset,13)"
],
"datasets": [],
"lastModified": {
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
},
"lastModified": {
"time": 1720594800000,
"actor": "urn:li:corpuser:test_username_2"
}
},
"dashboardUrl": "mock://mock-domain.preset.io/dashboard/test_dashboard_url_2"
}
}
]
}
},
"systemMetadata": {
"lastObserved": 1720594800000,
"runId": "preset-test",
"lastRunId": "no-run-id-provided"
}
},
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": {
"urn": "urn:li:chart:(preset,10)",
"aspects": [
{
"com.linkedin.pegasus2avro.common.Status": {
"removed": false
}
},
{
"com.linkedin.pegasus2avro.chart.ChartInfo": {
"customProperties": {
"Metrics": "",
"Filters": "",
"Dimensions": ""
},
"title": "test_chart_title_1",
"description": "",
"lastModified": {
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
},
"lastModified": {
"time": 1720594800000,
"actor": "urn:li:corpuser:test_username_1"
}
},
"chartUrl": "mock://mock-domain.preset.io/explore/test_chart_url_10",
"inputs": [
{
"string": "urn:li:dataset:(urn:li:dataPlatform:external,test_database_name.test_schema_name.test_table_name,PROD)"
}
],
"type": "BAR"
}
}
]
}
},
"systemMetadata": {
"lastObserved": 1720594800000,
"runId": "preset-test",
"lastRunId": "no-run-id-provided"
}
},
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": {
"urn": "urn:li:chart:(preset,11)",
"aspects": [
{
"com.linkedin.pegasus2avro.common.Status": {
"removed": false
}
},
{
"com.linkedin.pegasus2avro.chart.ChartInfo": {
"customProperties": {
"Metrics": "",
"Filters": "",
"Dimensions": ""
},
"title": "test_chart_title_2",
"description": "",
"lastModified": {
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
},
"lastModified": {
"time": 1720594800000,
"actor": "urn:li:corpuser:test_username_1"
}
},
"chartUrl": "mock://mock-domain.preset.io/explore/test_chart_url_11",
"inputs": [
{
"string": "urn:li:dataset:(urn:li:dataPlatform:external,test_database_name.test_schema_name.test_table_name,PROD)"
}
],
"type": "PIE"
}
}
]
}
},
"systemMetadata": {
"lastObserved": 1720594800000,
"runId": "preset-test",
"lastRunId": "no-run-id-provided"
}
},
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": {
"urn": "urn:li:chart:(preset,12)",
"aspects": [
{
"com.linkedin.pegasus2avro.common.Status": {
"removed": false
}
},
{
"com.linkedin.pegasus2avro.chart.ChartInfo": {
"customProperties": {
"Metrics": "",
"Filters": "",
"Dimensions": ""
},
"title": "test_chart_title_3",
"description": "",
"lastModified": {
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
},
"lastModified": {
"time": 1720594800000,
"actor": "urn:li:corpuser:test_username_2"
}
},
"chartUrl": "mock://mock-domain.preset.io/explore/test_chart_url_12",
"inputs": [
{
"string": "urn:li:dataset:(urn:li:dataPlatform:external,test_database_name.test_schema_name.test_table_name,PROD)"
}
],
"type": "AREA"
}
}
]
}
},
"systemMetadata": {
"lastObserved": 1720594800000,
"runId": "preset-test",
"lastRunId": "no-run-id-provided"
}
},
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": {
"urn": "urn:li:chart:(preset,13)",
"aspects": [
{
"com.linkedin.pegasus2avro.common.Status": {
"removed": false
}
},
{
"com.linkedin.pegasus2avro.chart.ChartInfo": {
"customProperties": {
"Metrics": "",
"Filters": "",
"Dimensions": ""
},
"title": "test_chart_title_4",
"description": "",
"lastModified": {
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
},
"lastModified": {
"time": 1720594800000,
"actor": "urn:li:corpuser:test_username_2"
}
},
"chartUrl": "mock://mock-domain.preset.io/explore/test_chart_url_13",
"inputs": [
{
"string": "urn:li:dataset:(urn:li:dataPlatform:external,test_database_name.test_schema_name.test_table_name,PROD)"
}
],
"type": "HISTOGRAM"
}
}
]
}
},
"systemMetadata": {
"lastObserved": 1720594800000,
"runId": "preset-test",
"lastRunId": "no-run-id-provided"
}
}
]

View File

@ -0,0 +1,261 @@
[
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.DashboardSnapshot": {
"urn": "urn:li:dashboard:(preset,1)",
"aspects": [
{
"com.linkedin.pegasus2avro.common.Status": {
"removed": false
}
},
{
"com.linkedin.pegasus2avro.dashboard.DashboardInfo": {
"customProperties": {
"Status": "published",
"IsPublished": "true",
"Owners": "test_username_1, test_username_2",
"IsCertified": "true",
"CertifiedBy": "Certification team",
"CertificationDetails": "Approved"
},
"title": "test_dashboard_title_1",
"description": "",
"charts": [
"urn:li:chart:(preset,10)",
"urn:li:chart:(preset,11)"
],
"datasets": [],
"lastModified": {
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
},
"lastModified": {
"time": 1720594800000,
"actor": "urn:li:corpuser:test_username_1"
}
},
"dashboardUrl": "mock://mock-domain.preset.io/dashboard/test_dashboard_url_1"
}
}
]
}
},
"systemMetadata": {
"lastObserved": 1720594800000,
"runId": "preset-2024_07_10-07_00_00",
"lastRunId": "no-run-id-provided",
"pipelineName": "test_pipeline"
}
},
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": {
"urn": "urn:li:chart:(preset,10)",
"aspects": [
{
"com.linkedin.pegasus2avro.common.Status": {
"removed": false
}
},
{
"com.linkedin.pegasus2avro.chart.ChartInfo": {
"customProperties": {
"Metrics": "",
"Filters": "",
"Dimensions": ""
},
"title": "test_chart_title_1",
"description": "",
"lastModified": {
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
},
"lastModified": {
"time": 1720594800000,
"actor": "urn:li:corpuser:test_username_1"
}
},
"chartUrl": "mock://mock-domain.preset.io/explore/test_chart_url_10",
"inputs": [
{
"string": "urn:li:dataset:(urn:li:dataPlatform:external,test_database_name.test_schema_name.test_table_name,PROD)"
}
],
"type": "BAR"
}
}
]
}
},
"systemMetadata": {
"lastObserved": 1720594800000,
"runId": "preset-2024_07_10-07_00_00",
"lastRunId": "no-run-id-provided",
"pipelineName": "test_pipeline"
}
},
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": {
"urn": "urn:li:chart:(preset,11)",
"aspects": [
{
"com.linkedin.pegasus2avro.common.Status": {
"removed": false
}
},
{
"com.linkedin.pegasus2avro.chart.ChartInfo": {
"customProperties": {
"Metrics": "",
"Filters": "",
"Dimensions": ""
},
"title": "test_chart_title_2",
"description": "",
"lastModified": {
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
},
"lastModified": {
"time": 1720594800000,
"actor": "urn:li:corpuser:test_username_1"
}
},
"chartUrl": "mock://mock-domain.preset.io/explore/test_chart_url_11",
"inputs": [
{
"string": "urn:li:dataset:(urn:li:dataPlatform:external,test_database_name.test_schema_name.test_table_name,PROD)"
}
],
"type": "PIE"
}
}
]
}
},
"systemMetadata": {
"lastObserved": 1720594800000,
"runId": "preset-2024_07_10-07_00_00",
"lastRunId": "no-run-id-provided",
"pipelineName": "test_pipeline"
}
},
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": {
"urn": "urn:li:chart:(preset,12)",
"aspects": [
{
"com.linkedin.pegasus2avro.common.Status": {
"removed": false
}
},
{
"com.linkedin.pegasus2avro.chart.ChartInfo": {
"customProperties": {
"Metrics": "",
"Filters": "",
"Dimensions": ""
},
"title": "test_chart_title_3",
"description": "",
"lastModified": {
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
},
"lastModified": {
"time": 1720594800000,
"actor": "urn:li:corpuser:test_username_2"
}
},
"chartUrl": "mock://mock-domain.preset.io/explore/test_chart_url_12",
"inputs": [
{
"string": "urn:li:dataset:(urn:li:dataPlatform:external,test_database_name.test_schema_name.test_table_name,PROD)"
}
],
"type": "AREA"
}
}
]
}
},
"systemMetadata": {
"lastObserved": 1720594800000,
"runId": "preset-2024_07_10-07_00_00",
"lastRunId": "no-run-id-provided",
"pipelineName": "test_pipeline"
}
},
{
"proposedSnapshot": {
"com.linkedin.pegasus2avro.metadata.snapshot.ChartSnapshot": {
"urn": "urn:li:chart:(preset,13)",
"aspects": [
{
"com.linkedin.pegasus2avro.common.Status": {
"removed": false
}
},
{
"com.linkedin.pegasus2avro.chart.ChartInfo": {
"customProperties": {
"Metrics": "",
"Filters": "",
"Dimensions": ""
},
"title": "test_chart_title_4",
"description": "",
"lastModified": {
"created": {
"time": 0,
"actor": "urn:li:corpuser:unknown"
},
"lastModified": {
"time": 1720594800000,
"actor": "urn:li:corpuser:test_username_2"
}
},
"chartUrl": "mock://mock-domain.preset.io/explore/test_chart_url_13",
"inputs": [
{
"string": "urn:li:dataset:(urn:li:dataPlatform:external,test_database_name.test_schema_name.test_table_name,PROD)"
}
],
"type": "HISTOGRAM"
}
}
]
}
},
"systemMetadata": {
"lastObserved": 1720594800000,
"runId": "preset-2024_07_10-07_00_00",
"lastRunId": "no-run-id-provided",
"pipelineName": "test_pipeline"
}
},
{
"entityType": "dashboard",
"entityUrn": "urn:li:dashboard:(preset,2)",
"changeType": "UPSERT",
"aspectName": "status",
"aspect": {
"json": {
"removed": true
}
},
"systemMetadata": {
"lastObserved": 1720594800000,
"runId": "preset-2024_07_10-07_00_00",
"lastRunId": "no-run-id-provided",
"pipelineName": "test_pipeline"
}
}
]

View File

@ -0,0 +1,366 @@
from typing import Any, Dict, Optional
from unittest.mock import patch
import pytest
from freezegun import freeze_time
from datahub.ingestion.run.pipeline import Pipeline
from tests.test_helpers import mce_helpers
from tests.test_helpers.state_helpers import (
get_current_checkpoint_from_pipeline,
run_and_get_pipeline,
validate_all_providers_have_committed_successfully,
)
FROZEN_TIME = "2024-07-10 07:00:00"
GMS_PORT = 8080
GMS_SERVER = f"http://localhost:{GMS_PORT}"
def register_mock_api(request_mock: Any, override_data: Optional[dict] = None) -> None:
if override_data is None:
override_data = {}
api_vs_response = {
"mock://mock-domain.preset.io/v1/auth/": {
"method": "POST",
"status_code": 200,
"json": {
"payload": {
"access_token": "test_token",
}
},
},
"mock://mock-domain.preset.io/version": {
"method": "GET",
"status_code": 200,
"json": {
"ci": {
"built_at": "Tue Jul 10 00:00:00 UTC 2024",
"build_num": "1",
"triggered_by": "Not triggered by a user",
},
"git": {
"branch": "4.0.1.6",
"sha": "test_sha",
"sha_superset": "test_sha_superset",
"release_name": "test_release_name",
},
"chart_version": "1.16.1",
"start_time": "2024-07-10 00:00:00",
"mt_deployment": True,
},
},
"mock://mock-domain.preset.io/api/v1/dashboard/": {
"method": "GET",
"status_code": 200,
"json": {
"count": 2,
"result": [
{
"id": "1",
"changed_by": {
"username": "test_username_1",
},
"changed_on_utc": "2024-07-10T07:00:00.000000+0000",
"dashboard_title": "test_dashboard_title_1",
"url": "/dashboard/test_dashboard_url_1",
"position_json": '{"CHART-test-1": {"meta": { "chartId": "10" }}, "CHART-test-2": {"meta": { "chartId": "11" }}}',
"status": "published",
"published": True,
"owners": [
{
"username": "test_username_1",
},
{
"username": "test_username_2",
},
],
"certified_by": "Certification team",
"certification_details": "Approved",
},
{
"id": "2",
"changed_by": {
"username": "test_username_2",
},
"changed_on_utc": "2024-07-10T07:00:00.000000+0000",
"dashboard_title": "test_dashboard_title_2",
"url": "/dashboard/test_dashboard_url_2",
"position_json": '{"CHART-test-3": {"meta": { "chartId": "12" }}, "CHART-test-4": {"meta": { "chartId": "13" }}}',
"status": "draft",
"published": False,
"owners": [
{
"first_name": "name",
},
],
"certified_by": "",
"certification_details": "",
},
],
},
},
"mock://mock-domain.preset.io/api/v1/chart/": {
"method": "GET",
"status_code": 200,
"json": {
"count": 4,
"result": [
{
"id": "10",
"changed_by": {
"username": "test_username_1",
},
"changed_on_utc": "2024-07-10T07:00:00.000000+0000",
"slice_name": "test_chart_title_1",
"viz_type": "box_plot",
"url": "/explore/test_chart_url_10",
"datasource_id": "20",
"params": '{"metrics": [], "adhoc_filters": []}',
},
{
"id": "11",
"changed_by": {
"username": "test_username_1",
},
"changed_on_utc": "2024-07-10T07:00:00.000000+0000",
"slice_name": "test_chart_title_2",
"viz_type": "pie",
"url": "/explore/test_chart_url_11",
"datasource_id": "20",
"params": '{"metrics": [], "adhoc_filters": []}',
},
{
"id": "12",
"changed_by": {
"username": "test_username_2",
},
"changed_on_utc": "2024-07-10T07:00:00.000000+0000",
"slice_name": "test_chart_title_3",
"viz_type": "treemap",
"url": "/explore/test_chart_url_12",
"datasource_id": "20",
"params": '{"metrics": [], "adhoc_filters": []}',
},
{
"id": "13",
"changed_by": {
"username": "test_username_2",
},
"changed_on_utc": "2024-07-10T07:00:00.000000+0000",
"slice_name": "test_chart_title_4",
"viz_type": "histogram",
"url": "/explore/test_chart_url_13",
"datasource_id": "20",
"params": '{"metrics": [], "adhoc_filters": []}',
},
],
},
},
"mock://mock-domain.preset.io/api/v1/dataset/20": {
"method": "GET",
"status_code": 200,
"json": {
"result": {
"schema": "test_schema_name",
"table_name": "test_table_name",
"database": {
"id": "30",
"database_name": "test_database_name",
},
},
},
},
"mock://mock-domain.preset.io/api/v1/database/30": {
"method": "GET",
"status_code": 200,
"json": {
"result": {
"sqlalchemy_uri": "test_sqlalchemy_uri",
},
},
},
}
api_vs_response.update(override_data)
for url in api_vs_response:
request_mock.register_uri(
api_vs_response[url]["method"],
url,
json=api_vs_response[url]["json"],
status_code=api_vs_response[url]["status_code"],
)
@freeze_time(FROZEN_TIME)
@pytest.mark.integration
def test_preset_ingest(pytestconfig, tmp_path, mock_time, requests_mock):
test_resources_dir = pytestconfig.rootpath / "tests/integration/preset"
register_mock_api(request_mock=requests_mock)
pipeline = Pipeline.create(
{
"run_id": "preset-test",
"source": {
"type": "preset",
"config": {
"connect_uri": "mock://mock-domain.preset.io/",
"manager_uri": "mock://mock-domain.preset.io",
"api_key": "test_key",
"api_secret": "test_secret",
"provider": "db",
},
},
"sink": {
"type": "file",
"config": {
"filename": f"{tmp_path}/preset_mces.json",
},
},
}
)
pipeline.run()
pipeline.raise_from_status()
golden_file = "golden_test_ingest.json"
mce_helpers.check_golden_file(
pytestconfig,
output_path=tmp_path / "preset_mces.json",
golden_path=f"{test_resources_dir}/{golden_file}",
)
@freeze_time(FROZEN_TIME)
@pytest.mark.integration
def test_preset_stateful_ingest(
pytestconfig, tmp_path, mock_time, requests_mock, mock_datahub_graph
):
test_resources_dir = pytestconfig.rootpath / "tests/integration/preset"
register_mock_api(request_mock=requests_mock)
pipeline_config_dict: Dict[str, Any] = {
"source": {
"type": "preset",
"config": {
"connect_uri": "mock://mock-domain.preset.io/",
"manager_uri": "mock://mock-domain.preset.io",
"api_key": "test_key",
"api_secret": "test_secret",
"provider": "db",
# enable stateful ingestion
"stateful_ingestion": {
"enabled": True,
"remove_stale_metadata": True,
"fail_safe_threshold": 100.0,
"state_provider": {
"type": "datahub",
"config": {"datahub_api": {"server": GMS_SERVER}},
},
},
},
},
"sink": {
# we are not really interested in the resulting events for this test
"type": "console"
},
"pipeline_name": "test_pipeline",
}
dashboard_endpoint_override = {
"mock://mock-domain.preset.io/api/v1/dashboard/": {
"method": "GET",
"status_code": 200,
"json": {
"count": 1,
"result": [
{
"id": "1",
"changed_by": {
"username": "test_username_1",
},
"changed_on_utc": "2024-07-10T07:00:00.000000+0000",
"dashboard_title": "test_dashboard_title_1",
"url": "/dashboard/test_dashboard_url_1",
"position_json": '{"CHART-test-1": {"meta": { "chartId": "10" }}, "CHART-test-2": {"meta": { "chartId": "11" }}}',
"status": "published",
"published": True,
"owners": [
{
"username": "test_username_1",
},
{
"username": "test_username_2",
},
],
"certified_by": "Certification team",
"certification_details": "Approved",
},
],
},
},
}
with patch(
"datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph",
mock_datahub_graph,
) as mock_checkpoint:
# Both checkpoint and reporting will use the same mocked graph instance.
mock_checkpoint.return_value = mock_datahub_graph
# Do the first run of the pipeline and get the default job's checkpoint.
pipeline_run1 = run_and_get_pipeline(pipeline_config_dict)
checkpoint1 = get_current_checkpoint_from_pipeline(pipeline_run1)
assert checkpoint1
assert checkpoint1.state
# Remove one dashboard from the preset config.
register_mock_api(
request_mock=requests_mock, override_data=dashboard_endpoint_override
)
# Capture MCEs of second run to validate Status(removed=true)
deleted_mces_path = f"{tmp_path}/preset_deleted_mces.json"
pipeline_config_dict["sink"]["type"] = "file"
pipeline_config_dict["sink"]["config"] = {"filename": deleted_mces_path}
# Do the second run of the pipeline.
pipeline_run2 = run_and_get_pipeline(pipeline_config_dict)
checkpoint2 = get_current_checkpoint_from_pipeline(pipeline_run2)
assert checkpoint2
assert checkpoint2.state
# Perform all assertions on the states. The deleted dashboard should not be
# part of the second state
state1 = checkpoint1.state
state2 = checkpoint2.state
difference_urns = list(
state1.get_urns_not_in(type="dashboard", other_checkpoint_state=state2)
)
assert len(difference_urns) == 1
urn1 = "urn:li:dashboard:(preset,2)"
assert urn1 in difference_urns
# Validate that all providers have committed successfully.
validate_all_providers_have_committed_successfully(
pipeline=pipeline_run1, expected_providers=1
)
validate_all_providers_have_committed_successfully(
pipeline=pipeline_run2, expected_providers=1
)
# Verify the output.
mce_helpers.check_golden_file(
pytestconfig,
output_path=deleted_mces_path,
golden_path=test_resources_dir / "golden_test_stateful_ingest.json",
)

View File

@ -0,0 +1,22 @@
from datahub.ingestion.source.preset import PresetConfig
def test_default_values():
config = PresetConfig.parse_obj({})
assert config.connect_uri == ""
assert config.manager_uri == "https://api.app.preset.io"
assert config.display_uri == ""
assert config.env == "PROD"
assert config.api_key is None
assert config.api_secret is None
def test_set_display_uri():
display_uri = "some_host:1234"
config = PresetConfig.parse_obj({"display_uri": display_uri})
assert config.connect_uri == ""
assert config.manager_uri == "https://api.app.preset.io"
assert config.display_uri == display_uri