Added pagination Tableau data sources graphql (#12187)

* Added pagination tableau graphql

* changed downstream workbook
This commit is contained in:
Onkar Ravgan 2023-06-28 18:27:09 +05:30 committed by GitHub
parent df7f5a7309
commit acf25f4555
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 131 additions and 46 deletions

View File

@ -18,6 +18,7 @@ source:
siteName: site_name siteName: site_name
siteUrl: site_url siteUrl: site_url
apiVersion: api_version apiVersion: api_version
paginationLimit: 10
sourceConfig: sourceConfig:
config: config:
type: DashboardMetadata type: DashboardMetadata

View File

@ -11,6 +11,7 @@
""" """
Wrapper module of TableauServerConnection client Wrapper module of TableauServerConnection client
""" """
import math
import traceback import traceback
from typing import Any, Callable, Dict, List, Optional from typing import Any, Callable, Dict, List, Optional
@ -23,9 +24,11 @@ from metadata.ingestion.source.dashboard.tableau import (
TABLEAU_GET_WORKBOOKS_PARAM_DICT, TABLEAU_GET_WORKBOOKS_PARAM_DICT,
) )
from metadata.ingestion.source.dashboard.tableau.models import ( from metadata.ingestion.source.dashboard.tableau.models import (
DataSource,
TableauChart, TableauChart,
TableauDashboard, TableauDashboard,
TableauDatasources, TableauDatasources,
TableauDatasourcesConnection,
TableauOwner, TableauOwner,
) )
from metadata.ingestion.source.dashboard.tableau.queries import ( from metadata.ingestion.source.dashboard.tableau.queries import (
@ -49,7 +52,13 @@ class TableauClient:
_client: TableauServerConnection _client: TableauServerConnection
def __init__(self, config: Dict[str, Dict[str, Any]], env: str, ssl_verify: bool): def __init__(
self,
config: Dict[str, Dict[str, Any]],
env: str,
ssl_verify: bool,
pagination_limit: int,
):
# ssl_verify is typed as a `bool` in TableauServerConnection # ssl_verify is typed as a `bool` in TableauServerConnection
# However, it is passed as `verify=self.ssl_verify` in each `requests` call. # However, it is passed as `verify=self.ssl_verify` in each `requests` call.
# In requests (https://requests.readthedocs.io/en/latest/user/advanced/#ssl-cert-verification) # In requests (https://requests.readthedocs.io/en/latest/user/advanced/#ssl-cert-verification)
@ -60,6 +69,7 @@ class TableauClient:
ssl_verify=ssl_verify, ssl_verify=ssl_verify,
) )
self._client.sign_in().json() self._client.sign_in().json()
self.pagination_limit = pagination_limit
@cached_property @cached_property
def server_info(self) -> Callable: def server_info(self) -> Callable:
@ -106,15 +116,25 @@ class TableauClient:
) )
] ]
def get_datasources(self): def _query_datasources(
self, entities_per_page: int, offset: int
) -> Optional[TableauDatasources]:
"""
Method to query the graphql endpoint to get data sources
"""
try: try:
datasources_graphql_result = self._client.metadata_graphql_query( datasources_graphql_result = self._client.metadata_graphql_query(
query=TABLEAU_DATASOURCES_QUERY query=TABLEAU_DATASOURCES_QUERY.format(
first=entities_per_page, offset=offset
)
) )
if datasources_graphql_result: if datasources_graphql_result:
resp = datasources_graphql_result.json() resp = datasources_graphql_result.json()
if resp and resp.get("data"): if resp and resp.get("data"):
return TableauDatasources(**resp.get("data")) tableau_datasource_connection = TableauDatasourcesConnection(
**resp.get("data")
)
return tableau_datasource_connection.embeddedDatasourcesConnection
except Exception: except Exception:
logger.debug(traceback.format_exc()) logger.debug(traceback.format_exc())
logger.warning( logger.warning(
@ -124,7 +144,32 @@ class TableauClient:
"https://help.tableau.com/current/api/metadata_api/en-us/docs/meta_api_start.html" "https://help.tableau.com/current/api/metadata_api/en-us/docs/meta_api_start.html"
"#enable-the-tableau-metadata-api-for-tableau-server\n" "#enable-the-tableau-metadata-api-for-tableau-server\n"
) )
return TableauDatasources(embeddedDatasources=[]) return None
def get_datasources(self) -> Optional[List["DataSource"]]:
    """
    Paginate and get the list of all data sources

    A first single-entity query is issued only to read ``totalCount``; the
    data sources are then fetched page by page through
    ``self._query_datasources``.

    Returns:
        The data source nodes collected from every page that could be
        fetched, or ``None`` when the total count cannot be determined or an
        unexpected error is raised.
    """
    try:
        # Query the graphql endpoint once to get total count of data sources
        count_probe = self._query_datasources(entities_per_page=1, offset=1)
        if count_probe is None or count_probe.totalCount is None:
            logger.warning("Unable to fetch Data Sources")
            return None

        # Never request more than 50 entities per page, and never fewer than
        # one: a zero or negative limit would otherwise break the pagination.
        entities_per_page = max(1, min(50, self.pagination_limit))

        # Paginate the results
        data_sources = []
        for offset in range(0, count_probe.totalCount, entities_per_page):
            page = self._query_datasources(
                entities_per_page=entities_per_page, offset=offset
            )
            # A failed page (None) or a page without nodes must not discard
            # the data sources already collected from the other pages.
            if page and page.nodes:
                data_sources.extend(page.nodes)
        return data_sources
    except Exception:
        logger.debug(traceback.format_exc())
        logger.warning("Unable to fetch Data Sources")
    return None
def sign_out(self) -> None: def sign_out(self) -> None:
self._client.sign_out() self._client.sign_out()

View File

@ -55,6 +55,7 @@ def get_connection(connection: TableauConnection) -> TableauClient:
config=tableau_server_config, config=tableau_server_config,
env=connection.env, env=connection.env,
ssl_verify=get_verify_ssl(connection.sslConfig), ssl_verify=get_verify_ssl(connection.sslConfig),
pagination_limit=connection.paginationLimit,
) )
except Exception as exc: except Exception as exc:
logger.debug(traceback.format_exc()) logger.debug(traceback.format_exc())

View File

@ -108,10 +108,9 @@ class TableauSource(DashboardServiceSource):
chart for chart in charts if chart.workbook.id == workbook.id chart for chart in charts if chart.workbook.id == workbook.id
] ]
for data_model in data_models.embeddedDatasources: for data_model in data_models or []:
for downstream_workbooks in data_model.downstreamWorkbooks or []: if data_model.workbook and data_model.workbook.luid == workbook.id:
if downstream_workbooks.luid == workbook.id: workbook.dataModels.append(data_model)
workbook.dataModels.append(data_model)
# collect all the tags from charts and workbooks before yielding final entities # collect all the tags from charts and workbooks before yielding final entities
if self.source_config.includeTags: if self.source_config.includeTags:

View File

@ -100,7 +100,7 @@ class DatasourceField(BaseModel):
description: Optional[str] description: Optional[str]
class DownstreamWorkbook(BaseModel): class Workbook(BaseModel):
id: str id: str
luid: str luid: str
name: str name: str
@ -131,12 +131,17 @@ class DataSource(BaseModel):
id: str id: str
name: str name: str
fields: Optional[List[DatasourceField]] fields: Optional[List[DatasourceField]]
downstreamWorkbooks: Optional[List[DownstreamWorkbook]] workbook: Optional[Workbook]
upstreamTables: Optional[List[UpstreamTable]] upstreamTables: Optional[List[UpstreamTable]]
class TableauDatasources(BaseModel): class TableauDatasources(BaseModel):
embeddedDatasources: Optional[List[DataSource]] nodes: Optional[List[DataSource]]
totalCount: Optional[int]
class TableauDatasourcesConnection(BaseModel):
    """
    Wrapper built from the ``data`` payload of the data sources GraphQL
    response; the paginated result is read from ``embeddedDatasourcesConnection``.
    """

    embeddedDatasourcesConnection: Optional[TableauDatasources]
class TableauChart(TableauBaseModel): class TableauChart(TableauBaseModel):

View File

@ -14,46 +14,49 @@ GraphQL queries used during ingestion
""" """
TABLEAU_DATASOURCES_QUERY = """ TABLEAU_DATASOURCES_QUERY = """
query { {{
embeddedDatasources { embeddedDatasourcesConnection(first: {first}, offset: {offset} ) {{
id nodes {{
name
fields {
id id
name name
upstreamColumns{ fields {{
id id
name name
remoteType upstreamColumns{{
} id
fullyQualifiedName name
description remoteType
} }}
downstreamWorkbooks { fullyQualifiedName
id description
luid }}
name workbook {{
}
upstreamTables {
id
luid
name
fullName
schema
referencedByQueries {
id id
luid
name name
query }}
} upstreamTables {{
columns {
id id
luid
name name
} fullName
database { schema
id referencedByQueries {{
name id
} name
} query
} }}
} columns {{
id
name
}}
database {{
id
name
}}
}}
}}
totalCount
}}
}}
""" """

View File

@ -116,6 +116,12 @@ This is a sample config for Tableau:
{% /codeInfo %} {% /codeInfo %}
{% codeInfo srNumber=18 %}
**paginationLimit**: The page size used while querying the Tableau GraphQL endpoint to get the data source information. Values above 50 are capped at 50.
{% /codeInfo %}
#### Source Configuration - Source Config #### Source Configuration - Source Config
{% codeInfo srNumber=8 %} {% codeInfo srNumber=8 %}
@ -186,6 +192,9 @@ source:
```yaml {% srNumber=7 %} ```yaml {% srNumber=7 %}
apiVersion: api_version apiVersion: api_version
``` ```
```yaml {% srNumber=18 %}
paginationLimit: pagination_limit
```
```yaml {% srNumber=8 %} ```yaml {% srNumber=8 %}
sourceConfig: sourceConfig:
config: config:

View File

@ -116,6 +116,12 @@ This is a sample config for Tableau:
{% /codeInfo %} {% /codeInfo %}
{% codeInfo srNumber=11 %}
**paginationLimit**: The page size used while querying the Tableau GraphQL endpoint to get the data source information. Values above 50 are capped at 50.
{% /codeInfo %}
#### Source Configuration - Source Config #### Source Configuration - Source Config
{% codeInfo srNumber=8 %} {% codeInfo srNumber=8 %}
@ -186,6 +192,9 @@ source:
```yaml {% srNumber=7 %} ```yaml {% srNumber=7 %}
apiVersion: api_version apiVersion: api_version
``` ```
```yaml {% srNumber=11 %}
paginationLimit: pagination_limit
```
```yaml {% srNumber=8 %} ```yaml {% srNumber=8 %}
sourceConfig: sourceConfig:
config: config:

View File

@ -221,6 +221,7 @@ For more information to get a Personal Access Token please visit this [link](htt
- **Site Name**: This corresponds to the `contentUrl` attribute in the Tableau REST API. The `site_name` is the portion of the URL that follows the `/site/` in the URL. - **Site Name**: This corresponds to the `contentUrl` attribute in the Tableau REST API. The `site_name` is the portion of the URL that follows the `/site/` in the URL.
- **Site URL**: If it is empty, the default Tableau site name will be used. - **Site URL**: If it is empty, the default Tableau site name will be used.
- **Environment**: The config object can have multiple environments. The default environment is defined as `tableau_prod`, and you can change this if needed by specifying an `env` parameter. - **Environment**: The config object can have multiple environments. The default environment is defined as `tableau_prod`, and you can change this if needed by specifying an `env` parameter.
- **Pagination Limit**: The page size used while querying the Tableau GraphQL endpoint to get the data source information. Values above 50 are capped at 50.
{% /extraContent %} {% /extraContent %}

View File

@ -61,6 +61,12 @@
"type": "string", "type": "string",
"default": "tableau_prod" "default": "tableau_prod"
}, },
"paginationLimit": {
"title": "Pagination Limit",
"description": "Pagination limit used while querying the Tableau Metadata API for getting data sources",
"type": "integer",
"default": 10
},
"verifySSL": { "verifySSL": {
"$ref": "../../../../security/ssl/verifySSLConfig.json#/definitions/verifySSL", "$ref": "../../../../security/ssl/verifySSLConfig.json#/definitions/verifySSL",
"default": "no-ssl" "default": "no-ssl"

View File

@ -85,6 +85,12 @@ $$section
The config object can have multiple environments. The default environment is defined as `tableau_prod`, and you can change this if needed by specifying an `env` parameter. The config object can have multiple environments. The default environment is defined as `tableau_prod`, and you can change this if needed by specifying an `env` parameter.
$$ $$
$$section
### Pagination Limit $(id="paginationLimit")
The pagination limit is the page size used while querying the Tableau GraphQL endpoint to get the data source information. Values above 50 are capped at 50.
$$
$$section $$section
### Verify SSL $(id="verifySSL") ### Verify SSL $(id="verifySSL")