feat(ingest/dbt-cloud): reduce graphql query complexity (#8390)

This commit is contained in:
Harshal Sheth 2023-07-12 04:41:13 -07:00 committed by GitHub
parent ea3e119210
commit ae7e3c2080
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -104,23 +104,20 @@ _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS = """
compiledCode compiledCode
""" """
_DBT_GRAPHQL_QUERY = f""" _DBT_FIELDS_BY_TYPE = {
query DatahubMetadataQuery($jobId: Int!, $runId: Int) {{ "models": f"""
models(jobId: $jobId, runId: $runId) {{
{ _DBT_GRAPHQL_COMMON_FIELDS } { _DBT_GRAPHQL_COMMON_FIELDS }
{ _DBT_GRAPHQL_NODE_COMMON_FIELDS } { _DBT_GRAPHQL_NODE_COMMON_FIELDS }
{ _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS } { _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS }
dependsOn dependsOn
materializedType materializedType
}} """,
"seeds": f"""
seeds(jobId: $jobId, runId: $runId) {{
{ _DBT_GRAPHQL_COMMON_FIELDS } { _DBT_GRAPHQL_COMMON_FIELDS }
{ _DBT_GRAPHQL_NODE_COMMON_FIELDS } { _DBT_GRAPHQL_NODE_COMMON_FIELDS }
{ _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS } { _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS }
}} """,
"sources": f"""
sources(jobId: $jobId, runId: $runId) {{
{ _DBT_GRAPHQL_COMMON_FIELDS } { _DBT_GRAPHQL_COMMON_FIELDS }
{ _DBT_GRAPHQL_NODE_COMMON_FIELDS } { _DBT_GRAPHQL_NODE_COMMON_FIELDS }
identifier identifier
@ -131,9 +128,8 @@ query DatahubMetadataQuery($jobId: Int!, $runId: Int) {{
state state
freshnessChecked freshnessChecked
loader loader
}} """,
"snapshots": f"""
snapshots(jobId: $jobId, runId: $runId) {{
{ _DBT_GRAPHQL_COMMON_FIELDS } { _DBT_GRAPHQL_COMMON_FIELDS }
{ _DBT_GRAPHQL_NODE_COMMON_FIELDS } { _DBT_GRAPHQL_NODE_COMMON_FIELDS }
{ _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS } { _DBT_GRAPHQL_MODEL_SEED_SNAPSHOT_FIELDS }
@ -143,9 +139,8 @@ query DatahubMetadataQuery($jobId: Int!, $runId: Int) {{
parentsModels {{ parentsModels {{
uniqueId uniqueId
}} }}
}} """,
"tests": f"""
tests(jobId: $jobId, runId: $runId) {{
{ _DBT_GRAPHQL_COMMON_FIELDS } { _DBT_GRAPHQL_COMMON_FIELDS }
state state
columnName columnName
@ -159,12 +154,18 @@ query DatahubMetadataQuery($jobId: Int!, $runId: Int) {{
rawCode rawCode
compiledSql compiledSql
compiledCode compiledCode
}} """,
# Currently unsupported dbt node types: # Currently unsupported dbt node types:
# - metrics # - metrics
# - snapshots # - snapshots
# - exposures # - exposures
}
_DBT_GRAPHQL_QUERY = """
query DatahubMetadataQuery_{type}($jobId: Int!, $runId: Int) {{
{type}(jobId: $jobId, runId: $runId) {{
{fields}
}}
}} }}
""" """
@ -206,15 +207,36 @@ class DBTCloudSource(DBTSourceBase):
# Additionally, we'd like to model dbt Cloud jobs/runs in DataHub # Additionally, we'd like to model dbt Cloud jobs/runs in DataHub
# as DataProcesses or DataJobs. # as DataProcesses or DataJobs.
logger.debug("Sending graphql request to the dbt Cloud metadata API") raw_nodes = []
response = requests.post( for node_type, fields in _DBT_FIELDS_BY_TYPE.items():
self.config.metadata_endpoint, logger.info(f"Fetching {node_type} from dbt Cloud")
json={ data = self._send_graphql_query(
"query": _DBT_GRAPHQL_QUERY, query=_DBT_GRAPHQL_QUERY.format(type=node_type, fields=fields),
"variables": { variables={
"jobId": self.config.job_id, "jobId": self.config.job_id,
"runId": self.config.run_id, "runId": self.config.run_id,
}, },
)
raw_nodes.extend(data[node_type])
nodes = [self._parse_into_dbt_node(node) for node in raw_nodes]
additional_metadata: Dict[str, Optional[str]] = {
"project_id": str(self.config.project_id),
"account_id": str(self.config.account_id),
"job_id": str(self.config.job_id),
}
return nodes, additional_metadata
def _send_graphql_query(self, query: str, variables: Dict) -> Dict:
logger.debug(f"Sending GraphQL query to dbt Cloud: {query}")
response = requests.post(
self.config.metadata_endpoint,
json={
"query": query,
"variables": variables,
}, },
headers={ headers={
"Authorization": f"Bearer {self.config.token}", "Authorization": f"Bearer {self.config.token}",
@ -233,23 +255,7 @@ class DBTCloudSource(DBTSourceBase):
response.raise_for_status() response.raise_for_status()
raise e raise e
raw_nodes = [ return data
*data["models"],
*data["seeds"],
*data["sources"],
*data["snapshots"],
*data["tests"],
]
nodes = [self._parse_into_dbt_node(node) for node in raw_nodes]
additional_metadata: Dict[str, Optional[str]] = {
"project_id": str(self.config.project_id),
"account_id": str(self.config.account_id),
"job_id": str(self.config.job_id),
}
return nodes, additional_metadata
def _parse_into_dbt_node(self, node: Dict) -> DBTNode: def _parse_into_dbt_node(self, node: Dict) -> DBTNode:
key = node["uniqueId"] key = node["uniqueId"]