mirror of
https://github.com/open-metadata/OpenMetadata.git
synced 2025-11-03 12:08:31 +00:00
Added dbt cloud multi projects and jobs filter (#18801)
* Added dbt cloud multi project and jobs filter * added tests * change to array type * updated yaml config * added migrations
This commit is contained in:
parent
b9e49fc35d
commit
9a21e77e15
@ -1747,3 +1747,29 @@ WHERE JSON_EXTRACT(json, '$.pipelineType') = 'metadata';
|
||||
UPDATE ingestion_pipeline_entity
|
||||
SET json = JSON_REMOVE(json, '$.sourceConfig.config.processPiiSensitive', '$.sourceConfig.config.confidence', '$.sourceConfig.config.generateSampleData')
|
||||
WHERE JSON_EXTRACT(json, '$.pipelineType') = 'profiler';
|
||||
|
||||
-- Rename 'jobId' to 'jobIds', set 'jobId' as type array in 'jobIds' , add 'projectIds' for dbt cloud
|
||||
UPDATE pipeline_service_entity
|
||||
SET json = JSON_SET(
|
||||
JSON_REMOVE(
|
||||
json,
|
||||
'$.connection.config.jobId'
|
||||
),
|
||||
'$.connection.config.jobIds',
|
||||
IF(
|
||||
JSON_CONTAINS_PATH(json, 'one', '$.connection.config.jobIds'),
|
||||
JSON_EXTRACT(json, '$.connection.config.jobIds'),
|
||||
IF(
|
||||
JSON_EXTRACT(json, '$.connection.config.jobId') IS NOT NULL,
|
||||
JSON_ARRAY(JSON_UNQUOTE(JSON_EXTRACT(json, '$.connection.config.jobId'))),
|
||||
JSON_ARRAY()
|
||||
)
|
||||
),
|
||||
'$.connection.config.projectIds',
|
||||
IF(
|
||||
JSON_CONTAINS_PATH(json, 'one', '$.connection.config.projectIds'),
|
||||
JSON_EXTRACT(json, '$.connection.config.projectIds'),
|
||||
JSON_ARRAY()
|
||||
)
|
||||
)
|
||||
WHERE serviceType = 'DBTCloud';
|
||||
|
||||
@ -1733,4 +1733,26 @@ WHERE json #>> '{pipelineType}' = 'metadata';
|
||||
-- classification and sampling configs from the profiler pipelines
|
||||
UPDATE ingestion_pipeline_entity
|
||||
SET json = json::jsonb #- '{sourceConfig,config,processPiiSensitive}' #- '{sourceConfig,config,confidence}' #- '{sourceConfig,config,generateSampleData}'
|
||||
WHERE json #>> '{pipelineType}' = 'profiler';
|
||||
WHERE json #>> '{pipelineType}' = 'profiler';
|
||||
|
||||
-- set value of 'jobId' as an array into 'jobIds' for dbt cloud
|
||||
UPDATE pipeline_service_entity
|
||||
SET json = (case when json#>>'{connection, config, jobId}' IS NOT null
|
||||
then
|
||||
jsonb_set(json, '{connection, config, jobIds}', to_jsonb(ARRAY[json#>>'{connection, config, jobId}']), true)
|
||||
else
|
||||
jsonb_set(json, '{connection, config, jobIds}', '[]', true)
|
||||
end
|
||||
)
|
||||
WHERE servicetype = 'DBTCloud';
|
||||
|
||||
-- remove 'jobId' after setting 'jobIds' for dbt cloud
|
||||
UPDATE pipeline_service_entity
|
||||
SET json = json::jsonb #- '{connection,config,jobId}'
|
||||
WHERE json#>>'{connection, config, jobId}' IS NOT null
|
||||
and servicetype = 'DBTCloud';
|
||||
|
||||
-- add 'projectIds' for dbt cloud
|
||||
UPDATE pipeline_service_entity
|
||||
SET json = jsonb_set(json, '{connection, config, projectIds}', '[]', true)
|
||||
WHERE servicetype = 'DBTCloud';
|
||||
|
||||
@ -7,7 +7,8 @@ source:
|
||||
host: https://account_prefix.account_region.dbt.com
|
||||
discoveryAPI: https://metadata.cloud.getdbt.com/graphql
|
||||
accountId: "numeric_account_id"
|
||||
# jobId: "numeric_job_id"
|
||||
# jobIds: ["job_id_1", "job_id_2", "job_id_3"]
|
||||
# projectIds: ["project_id_1", "project_id_2", "project_id_3"]
|
||||
token: auth_token
|
||||
sourceConfig:
|
||||
config:
|
||||
|
||||
@ -35,6 +35,7 @@ from metadata.utils.helpers import clean_uri
|
||||
from metadata.utils.logger import ometa_logger
|
||||
|
||||
logger = ometa_logger()
|
||||
API_VERSION = "api/v2"
|
||||
|
||||
|
||||
class DBTCloudClient:
|
||||
@ -44,9 +45,13 @@ class DBTCloudClient:
|
||||
|
||||
def __init__(self, config: DBTCloudConnection):
|
||||
self.config = config
|
||||
|
||||
self.job_ids = self.config.jobIds
|
||||
self.project_ids = self.config.projectIds
|
||||
|
||||
client_config: ClientConfig = ClientConfig(
|
||||
base_url=clean_uri(self.config.host),
|
||||
api_version="api/v2",
|
||||
api_version=API_VERSION,
|
||||
auth_header=AUTHORIZATION_HEADER,
|
||||
auth_token=lambda: (self.config.token.get_secret_value(), 0),
|
||||
allow_redirects=True,
|
||||
@ -63,27 +68,43 @@ class DBTCloudClient:
|
||||
self.client = REST(client_config)
|
||||
self.graphql_client = REST(graphql_client_config)
|
||||
|
||||
def test_get_jobs(self) -> Optional[List[DBTJob]]:
|
||||
def _get_jobs(
|
||||
self, job_id: str = None, project_id: str = None
|
||||
) -> Optional[List[DBTJob]]:
|
||||
"""
|
||||
fetch jobs for an account in dbt cloud
|
||||
"""
|
||||
job_list = []
|
||||
try:
|
||||
job_path = f"{job_id}/" if job_id else ""
|
||||
project_path = f"?project_id={project_id}" if project_id else ""
|
||||
result = self.client.get(
|
||||
f"/accounts/{self.config.accountId}/jobs/{job_path}{project_path}"
|
||||
)
|
||||
job_list = (
|
||||
[DBTJob.model_validate(result["data"])]
|
||||
if job_id
|
||||
else DBTJobList.model_validate(result).Jobs
|
||||
)
|
||||
except Exception as exc:
|
||||
logger.debug(traceback.format_exc())
|
||||
logger.error(
|
||||
f"Failed to get job info for project_id: `{project_id}` or job_id: `{job_id}` : {exc}"
|
||||
)
|
||||
return job_list
|
||||
|
||||
def test_get_jobs(self) -> List[DBTJob]:
|
||||
"""
|
||||
test fetch jobs for an account in dbt cloud
|
||||
"""
|
||||
job_path = f"{self.config.jobId}/" if self.config.jobId else ""
|
||||
result = self.client.get(f"/accounts/{self.config.accountId}/jobs/{job_path}")
|
||||
job_list = (
|
||||
[DBTJob.model_validate(result["data"])]
|
||||
if self.config.jobId
|
||||
else DBTJobList.model_validate(result).Jobs
|
||||
)
|
||||
return job_list
|
||||
job_list = self.client.get(f"/accounts/{self.config.accountId}/jobs/")
|
||||
return DBTJobList.model_validate(job_list).Jobs
|
||||
|
||||
def test_get_runs(self, job_id: int) -> Optional[List[DBTRun]]:
|
||||
def test_get_runs(self) -> List[DBTRun]:
|
||||
"""
|
||||
test fetch runs for a job in dbt cloud
|
||||
"""
|
||||
result = self.client.get(
|
||||
f"/accounts/{self.config.accountId}/runs/",
|
||||
data={"job_definition_id": job_id},
|
||||
)
|
||||
result = self.client.get(f"/accounts/{self.config.accountId}/runs/")
|
||||
run_list = DBTRunList.model_validate(result).Runs
|
||||
return run_list
|
||||
|
||||
@ -92,17 +113,29 @@ class DBTCloudClient:
|
||||
list jobs for an account in dbt cloud
|
||||
"""
|
||||
try:
|
||||
job_path = f"{self.config.jobId}/" if self.config.jobId else ""
|
||||
result = self.client.get(
|
||||
f"/accounts/{self.config.accountId}/jobs/{job_path}"
|
||||
)
|
||||
if result:
|
||||
job_list = (
|
||||
[DBTJob.model_validate(result.get("data"))]
|
||||
if self.config.jobId
|
||||
else DBTJobList.model_validate(result).Jobs
|
||||
)
|
||||
return job_list
|
||||
jobs = []
|
||||
# case when job_ids are specified and project_ids are not
|
||||
if self.job_ids and not self.project_ids:
|
||||
for job_id in self.job_ids:
|
||||
jobs.extend(self._get_jobs(job_id=job_id))
|
||||
# case when project_ids are specified or both are specified
|
||||
elif self.project_ids:
|
||||
for project_id in self.project_ids:
|
||||
results = self._get_jobs(project_id=project_id)
|
||||
if self.job_ids:
|
||||
jobs.extend(
|
||||
[
|
||||
result
|
||||
for result in results
|
||||
if str(result.id) in self.job_ids
|
||||
]
|
||||
)
|
||||
else:
|
||||
jobs.extend(results)
|
||||
else:
|
||||
results = self._get_jobs()
|
||||
jobs.extend(results)
|
||||
return jobs
|
||||
except Exception as exc:
|
||||
logger.debug(traceback.format_exc())
|
||||
logger.error(f"Unable to get job info :{exc}")
|
||||
@ -141,6 +174,9 @@ class DBTCloudClient:
|
||||
|
||||
if result.get("data") and result["data"].get("job"):
|
||||
model_list = DBTModelList.model_validate(result["data"]["job"]).models
|
||||
logger.debug(
|
||||
f"Successfully fetched models from dbt for job_id:{job_id} run_id:{run_id}: {model_list}"
|
||||
)
|
||||
return model_list
|
||||
|
||||
except Exception as exc:
|
||||
@ -150,7 +186,7 @@ class DBTCloudClient:
|
||||
|
||||
def get_models_and_seeds_details(self, job_id: int, run_id: int):
|
||||
"""
|
||||
get model details for a job in dbt cloud for lineage
|
||||
get parent model details for a job in dbt cloud for lineage
|
||||
"""
|
||||
try:
|
||||
query_params = {
|
||||
@ -163,9 +199,12 @@ class DBTCloudClient:
|
||||
if result.get("data") and result["data"].get("job"):
|
||||
result = DBTModelList.model_validate(result["data"]["job"])
|
||||
parents_list = result.models + result.seeds
|
||||
logger.debug(
|
||||
f"Successfully fetched parent models from dbt for job_id:{job_id} run_id:{run_id}: {parents_list}"
|
||||
)
|
||||
return parents_list
|
||||
|
||||
except Exception as exc:
|
||||
logger.debug(traceback.format_exc())
|
||||
logger.warning(f"Unable to get model info :{exc}")
|
||||
logger.warning(f"Unable to get parents model info :{exc}")
|
||||
return None
|
||||
|
||||
@ -50,11 +50,9 @@ def test_connection(
|
||||
of a metadata workflow or during an Automation Workflow
|
||||
"""
|
||||
|
||||
job_id = int(service_connection.jobId) if service_connection.jobId else 0
|
||||
|
||||
test_fn = {
|
||||
"GetJobs": client.test_get_jobs,
|
||||
"GetRuns": partial(client.test_get_runs, job_id=job_id),
|
||||
"GetRuns": partial(client.test_get_runs),
|
||||
}
|
||||
|
||||
return test_connection_steps(
|
||||
|
||||
@ -35,7 +35,7 @@ class DBTJob(BaseModel):
|
||||
|
||||
|
||||
class DBTJobList(BaseModel):
|
||||
Jobs: Optional[List[DBTJob]] = Field([], alias="data")
|
||||
Jobs: List[DBTJob] = Field(alias="data")
|
||||
|
||||
|
||||
class DBTRun(BaseModel):
|
||||
|
||||
@ -386,7 +386,8 @@ mock_dbtcloud_config = {
|
||||
"host": "https://abc12.us1.dbt.com",
|
||||
"discoveryAPI": "https://metadata.cloud.getdbt.com/graphql",
|
||||
"accountId": "70403103922125",
|
||||
"jobId": "70403103922125",
|
||||
"jobIds": ["70403103922125", "70403103922126"],
|
||||
"projectIds": ["70403103922127", "70403103922128"],
|
||||
"token": "dbt_token",
|
||||
}
|
||||
},
|
||||
@ -510,6 +511,10 @@ MOCK_PIPELINE = Pipeline(
|
||||
sourceHash=None,
|
||||
)
|
||||
|
||||
EXPECTED_JOB_FILTERS = ["70403103922125", "70403103922126"]
|
||||
|
||||
EXPECTED_PROJECT_FILTERS = ["70403103922127", "70403103922128"]
|
||||
|
||||
EXPECTED_PIPELINE_NAME = str(MOCK_JOB_RESULT["data"][0]["name"])
|
||||
|
||||
|
||||
@ -547,6 +552,10 @@ class DBTCloudUnitTest(TestCase):
|
||||
== EXPECTED_PIPELINE_NAME
|
||||
)
|
||||
|
||||
def test_filters_to_list(self):
|
||||
assert self.dbtcloud.client.job_ids == EXPECTED_JOB_FILTERS
|
||||
assert self.dbtcloud.client.project_ids == EXPECTED_PROJECT_FILTERS
|
||||
|
||||
def test_pipelines(self):
|
||||
pipeline = list(self.dbtcloud.yield_pipeline(EXPECTED_JOB_DETAILS))[0].right
|
||||
assert pipeline == EXPECTED_CREATED_PIPELINES
|
||||
|
||||
@ -69,7 +69,11 @@ To know more about permissions required refer [here](https://docs.getdbt.com/doc
|
||||
|
||||
- **Account Id** : The Account ID of your DBT cloud Project. Go to your dbt cloud account settings to know your Account Id. This will be a numeric value but in openmetadata we parse it as a string.
|
||||
|
||||
- **Job Id** : Optional. The Job ID of your DBT cloud Job in your Project to fetch metadata for. Look for the segment after "jobs" in the URL. For instance, in a URL like `https://cloud.getdbt.com/accounts/123/projects/87477/jobs/73659994`, the job ID is `73659994`. This will be a numeric value but in openmetadata we parse it as a string. If not passed all Jobs under the Account id will be ingested.
|
||||
- **Job Ids** : Optional. Job IDs of your DBT cloud Jobs in your Project to fetch metadata for. Look for the segment after "jobs" in the URL. For instance, in a URL like `https://cloud.getdbt.com/accounts/123/projects/87477/jobs/73659994`, the job ID is `73659994`. This will be a numeric value but in openmetadata we parse it as a string. If not passed all Jobs under the Account id will be ingested.
|
||||
|
||||
- **Project Ids** : Optional. Project IDs of your DBT cloud Account to fetch metadata for. Look for the segment after "projects" in the URL. For instance, in a URL like `https://cloud.getdbt.com/accounts/123/projects/87477/jobs/73659994`, the job ID is `87477`. This will be a numeric value but in openmetadata we parse it as a string. If not passed all Projects under the Account id will be ingested.
|
||||
|
||||
Note that if both `Job Ids` and `Project Ids` are passed then it will filter out the jobs from the passed projects. any `Job Ids` not belonging to the `Project Ids` will also be filtered out.
|
||||
|
||||
- **Token** : The Authentication Token of your DBT cloud API Account. To get your access token you can follow the docs [here](https://docs.getdbt.com/docs/dbt-cloud-apis/authentication).
|
||||
Make sure you have the necessary permissions on the token to run graphql queries and get job and run details.
|
||||
|
||||
@ -70,12 +70,20 @@ This is a sample config for dbt Cloud:
|
||||
|
||||
{% codeInfo srNumber=4 %}
|
||||
|
||||
**jobId**: Optional. The Job ID of your DBT cloud Job in your Project to fetch metadata for. Look for the segment after "jobs" in the URL. For instance, in a URL like `https://cloud.getdbt.com/accounts/123/projects/87477/jobs/73659994`, the job ID is `73659994`. This will be a numeric value but in openmetadata we parse it as a string. If not passed all Jobs under the Account id will be ingested.
|
||||
**jobIds**: Optional. Job IDs of your DBT cloud Jobs in your Project to fetch metadata for. Look for the segment after "jobs" in the URL. For instance, in a URL like `https://cloud.getdbt.com/accounts/123/projects/87477/jobs/73659994`, the job ID is `73659994`. This will be a numeric value but in openmetadata we parse it as a string. If not passed all Jobs under the Account id will be ingested.
|
||||
|
||||
{% /codeInfo %}
|
||||
|
||||
{% codeInfo srNumber=5 %}
|
||||
|
||||
**projectIds**: Optional. Project IDs of your DBT cloud Account to fetch metadata for. Look for the segment after "projects" in the URL. For instance, in a URL like `https://cloud.getdbt.com/accounts/123/projects/87477/jobs/73659994`, the job ID is `87477`. This will be a numeric value but in openmetadata we parse it as a string. If not passed all Projects under the Account id will be ingested.
|
||||
|
||||
Note that if both `Job Ids` and `Project Ids` are passed then it will filter out the jobs from the passed projects. any `Job Ids` not belonging to the `Project Ids` will also be filtered out.
|
||||
|
||||
{% /codeInfo %}
|
||||
|
||||
{% codeInfo srNumber=6 %}
|
||||
|
||||
**token**: The Authentication Token of your DBT cloud API Account. To get your access token you can follow the docs [here](https://docs.getdbt.com/docs/dbt-cloud-apis/authentication).
|
||||
Make sure you have the necessary permissions on the token to run graphql queries and get job and run details.
|
||||
|
||||
@ -111,9 +119,12 @@ source:
|
||||
accountId: "numeric_account_id"
|
||||
```
|
||||
```yaml {% srNumber=4 %}
|
||||
# jobId: "numeric_job_id"
|
||||
# jobIds: ["job_id_1", "job_id_2", "job_id_3"]
|
||||
```
|
||||
```yaml {% srNumber=5 %}
|
||||
# projectIds: ["project_id_1", "project_id_2", "project_id_3"]
|
||||
```
|
||||
```yaml {% srNumber=6 %}
|
||||
token: auth_token
|
||||
```
|
||||
|
||||
|
||||
@ -39,11 +39,21 @@
|
||||
"description": "ID of your DBT cloud account",
|
||||
"type": "string"
|
||||
},
|
||||
"jobId": {
|
||||
"title": "Job Id",
|
||||
"description": "ID of your DBT cloud job",
|
||||
"type": "string",
|
||||
"default": null
|
||||
"jobIds": {
|
||||
"title": "Job Ids",
|
||||
"description": "List of IDs of your DBT cloud jobs seperated by comma `,`",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"projectIds": {
|
||||
"title": "Project Ids",
|
||||
"description": "List of IDs of your DBT cloud projects seperated by comma `,`",
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
},
|
||||
"token": {
|
||||
"title": "Token",
|
||||
@ -54,5 +64,4 @@
|
||||
},
|
||||
"additionalProperties": false,
|
||||
"required": ["host", "discoveryAPI", "accountId", "token"]
|
||||
}
|
||||
|
||||
}
|
||||
@ -26,8 +26,15 @@ The Account ID of your DBT cloud Project. Go to your dbt cloud account settings
|
||||
$$
|
||||
|
||||
$$section
|
||||
### Job Id $(id="jobId")
|
||||
The Job ID of your DBT cloud Job in your Project to fetch metadata for. Look for the segment after "jobs" in the URL. For instance, in a URL like `https://cloud.getdbt.com/accounts/123/projects/87477/jobs/73659994`, the job ID is `73659994`. This will be a numeric value but in openmetadata we parse it as a string. If not passed all Jobs under the Account id will be ingested. `Optional`
|
||||
### Job Ids $(id="jobIds")
|
||||
Job IDs of your DBT cloud Jobs in your Project to fetch metadata for. Look for the segment after "jobs" in the URL. For instance, in a URL like `https://cloud.getdbt.com/accounts/123/projects/87477/jobs/73659994`, the job ID is `73659994`. This will be a numeric value but in openmetadata we parse it as a string. If not passed all Jobs under the Account Id will be ingested. `Optional`
|
||||
$$
|
||||
|
||||
$$section
|
||||
### Project Ids $(id="projectIds")
|
||||
Project IDs of your DBT cloud Account to fetch metadata for. Look for the segment after "projects" in the URL. For instance, in a URL like `https://cloud.getdbt.com/accounts/123/projects/87477/jobs/73659994`, the job ID is `87477`. This will be a numeric value but in openmetadata we parse it as a string. If not passed all Projects under the Account Id will be ingested. `Optional`
|
||||
|
||||
Note that if both `Job Ids` and `Project Ids` are passed then it will filter out the jobs from the passed projects. any `Job Ids` not belonging to the `Project Ids` will also be filtered out.
|
||||
$$
|
||||
|
||||
$$section
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user