Added pagination Tableau data sources graphql (#12187)

* Added pagination tableau graphql

* changed downstream workbook
This commit is contained in:
Onkar Ravgan 2023-06-28 18:27:09 +05:30 committed by GitHub
parent df7f5a7309
commit acf25f4555
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 131 additions and 46 deletions

View File

@ -18,6 +18,7 @@ source:
siteName: site_name
siteUrl: site_url
apiVersion: api_version
paginationLimit: 10
sourceConfig:
config:
type: DashboardMetadata

View File

@ -11,6 +11,7 @@
"""
Wrapper module of TableauServerConnection client
"""
import math
import traceback
from typing import Any, Callable, Dict, List, Optional
@ -23,9 +24,11 @@ from metadata.ingestion.source.dashboard.tableau import (
TABLEAU_GET_WORKBOOKS_PARAM_DICT,
)
from metadata.ingestion.source.dashboard.tableau.models import (
DataSource,
TableauChart,
TableauDashboard,
TableauDatasources,
TableauDatasourcesConnection,
TableauOwner,
)
from metadata.ingestion.source.dashboard.tableau.queries import (
@ -49,7 +52,13 @@ class TableauClient:
_client: TableauServerConnection
def __init__(self, config: Dict[str, Dict[str, Any]], env: str, ssl_verify: bool):
def __init__(
self,
config: Dict[str, Dict[str, Any]],
env: str,
ssl_verify: bool,
pagination_limit: int,
):
# ssl_verify is typed as a `bool` in TableauServerConnection
# However, it is passed as `verify=self.ssl_verify` in each `requests` call.
# In requests (https://requests.readthedocs.io/en/latest/user/advanced/#ssl-cert-verification)
@ -60,6 +69,7 @@ class TableauClient:
ssl_verify=ssl_verify,
)
self._client.sign_in().json()
self.pagination_limit = pagination_limit
@cached_property
def server_info(self) -> Callable:
@ -106,15 +116,25 @@ class TableauClient:
)
]
def get_datasources(self):
def _query_datasources(
self, entities_per_page: int, offset: int
) -> Optional[TableauDatasources]:
"""
Method to query the graphql endpoint to get data sources
"""
try:
datasources_graphql_result = self._client.metadata_graphql_query(
query=TABLEAU_DATASOURCES_QUERY
query=TABLEAU_DATASOURCES_QUERY.format(
first=entities_per_page, offset=offset
)
)
if datasources_graphql_result:
resp = datasources_graphql_result.json()
if resp and resp.get("data"):
return TableauDatasources(**resp.get("data"))
tableau_datasource_connection = TableauDatasourcesConnection(
**resp.get("data")
)
return tableau_datasource_connection.embeddedDatasourcesConnection
except Exception:
logger.debug(traceback.format_exc())
logger.warning(
@ -124,7 +144,32 @@ class TableauClient:
"https://help.tableau.com/current/api/metadata_api/en-us/docs/meta_api_start.html"
"#enable-the-tableau-metadata-api-for-tableau-server\n"
)
return TableauDatasources(embeddedDatasources=[])
return None
def get_datasources(self) -> Optional[List[DataSource]]:
    """
    Paginate over the graphql endpoint and get the list of all data sources.

    Returns the data sources collected from every page that could be
    fetched, an empty list when the endpoint reports no data sources, or
    None when the total count cannot be determined or an unexpected
    error is raised.
    """
    try:
        # Query the graphql endpoint once to get total count of data sources.
        # Start at the first record so the probe is valid for any total.
        probe = self._query_datasources(entities_per_page=1, offset=0)
        if not probe or probe.totalCount is None:
            logger.warning("Unable to fetch Data Sources")
            return None

        # Page size is capped at 50 and floored at 1, so a zero or negative
        # pagination limit cannot cause a division by zero below
        entities_per_page = max(1, min(50, self.pagination_limit))
        total_pages = math.ceil(probe.totalCount / entities_per_page)

        # Paginate the results
        data_sources = []
        for page_number in range(total_pages):
            page = self._query_datasources(
                entities_per_page=entities_per_page,
                offset=page_number * entities_per_page,
            )
            # A failed page comes back as None and a page may carry no nodes;
            # skip it instead of discarding the pages already collected
            if page and page.nodes:
                data_sources.extend(page.nodes)
        return data_sources
    except Exception:
        logger.debug(traceback.format_exc())
        logger.warning("Unable to fetch Data Sources")
    return None
def sign_out(self) -> None:
self._client.sign_out()

View File

@ -55,6 +55,7 @@ def get_connection(connection: TableauConnection) -> TableauClient:
config=tableau_server_config,
env=connection.env,
ssl_verify=get_verify_ssl(connection.sslConfig),
pagination_limit=connection.paginationLimit,
)
except Exception as exc:
logger.debug(traceback.format_exc())

View File

@ -108,10 +108,9 @@ class TableauSource(DashboardServiceSource):
chart for chart in charts if chart.workbook.id == workbook.id
]
for data_model in data_models.embeddedDatasources:
for downstream_workbooks in data_model.downstreamWorkbooks or []:
if downstream_workbooks.luid == workbook.id:
workbook.dataModels.append(data_model)
for data_model in data_models or []:
if data_model.workbook and data_model.workbook.luid == workbook.id:
workbook.dataModels.append(data_model)
# collect all the tags from charts and workbooks before yielding final entities
if self.source_config.includeTags:

View File

@ -100,7 +100,7 @@ class DatasourceField(BaseModel):
description: Optional[str]
class DownstreamWorkbook(BaseModel):
class Workbook(BaseModel):
id: str
luid: str
name: str
@ -131,12 +131,17 @@ class DataSource(BaseModel):
id: str
name: str
fields: Optional[List[DatasourceField]]
downstreamWorkbooks: Optional[List[DownstreamWorkbook]]
workbook: Optional[Workbook]
upstreamTables: Optional[List[UpstreamTable]]
class TableauDatasources(BaseModel):
embeddedDatasources: Optional[List[DataSource]]
nodes: Optional[List[DataSource]]
totalCount: Optional[int]
# Model instantiated by the Tableau client from the `data` object of the
# data sources graphql response.
class TableauDatasourcesConnection(BaseModel):
# Result of the `embeddedDatasourcesConnection` query field: one page of
# data source `nodes` together with the `totalCount`.
embeddedDatasourcesConnection: Optional[TableauDatasources]
class TableauChart(TableauBaseModel):

View File

@ -14,46 +14,49 @@ GraphQL queries used during ingestion
"""
TABLEAU_DATASOURCES_QUERY = """
query {
embeddedDatasources {
id
name
fields {
{{
embeddedDatasourcesConnection(first: {first}, offset: {offset} ) {{
nodes {{
id
name
upstreamColumns{
fields {{
id
name
remoteType
}
fullyQualifiedName
description
}
downstreamWorkbooks {
id
luid
name
}
upstreamTables {
id
luid
name
fullName
schema
referencedByQueries {
upstreamColumns{{
id
name
remoteType
}}
fullyQualifiedName
description
}}
workbook {{
id
luid
name
query
}
columns {
}}
upstreamTables {{
id
luid
name
}
database {
id
name
}
}
}
}
fullName
schema
referencedByQueries {{
id
name
query
}}
columns {{
id
name
}}
database {{
id
name
}}
}}
}}
totalCount
}}
}}
"""

View File

@ -116,6 +116,12 @@ This is a sample config for Tableau:
{% /codeInfo %}
{% codeInfo srNumber=18 %}
**paginationLimit**: The number of data sources requested per page when querying the Tableau GraphQL endpoint (Metadata API) for data source information. The default is 10, and values above 50 are capped at 50.
{% /codeInfo %}
#### Source Configuration - Source Config
{% codeInfo srNumber=8 %}
@ -186,6 +192,9 @@ source:
```yaml {% srNumber=7 %}
apiVersion: api_version
```
```yaml {% srNumber=18 %}
paginationLimit: pagination_limit
```
```yaml {% srNumber=8 %}
sourceConfig:
config:

View File

@ -116,6 +116,12 @@ This is a sample config for Tableau:
{% /codeInfo %}
{% codeInfo srNumber=11 %}
**paginationLimit**: The number of data sources requested per page when querying the Tableau GraphQL endpoint (Metadata API) for data source information. The default is 10, and values above 50 are capped at 50.
{% /codeInfo %}
#### Source Configuration - Source Config
{% codeInfo srNumber=8 %}
@ -186,6 +192,9 @@ source:
```yaml {% srNumber=7 %}
apiVersion: api_version
```
```yaml {% srNumber=11 %}
paginationLimit: pagination_limit
```
```yaml {% srNumber=8 %}
sourceConfig:
config:

View File

@ -221,6 +221,7 @@ For more information to get a Personal Access Token please visit this [link](htt
- **Site Name**: This corresponds to the `contentUrl` attribute in the Tableau REST API. The `site_name` is the portion of the URL that follows the `/site/` in the URL.
- **Site URL**: If it is empty, the default Tableau site name will be used.
- **Environment**: The config object can have multiple environments. The default environment is defined as `tableau_prod`, and you can change this if needed by specifying an `env` parameter.
- **Pagination Limit**: The number of data sources requested per page when querying the Tableau GraphQL endpoint (Metadata API) for data source information. The default is 10, and values above 50 are capped at 50.
{% /extraContent %}

View File

@ -61,6 +61,12 @@
"type": "string",
"default": "tableau_prod"
},
"paginationLimit": {
"title": "Pagination Limit",
"description": "Pagination limit used while querying the tableau metadata API for getting data sources",
"type": "integer",
"default": 10
},
"verifySSL": {
"$ref": "../../../../security/ssl/verifySSLConfig.json#/definitions/verifySSL",
"default": "no-ssl"

View File

@ -85,6 +85,12 @@ $$section
The config object can have multiple environments. The default environment is defined as `tableau_prod`, and you can change this if needed by specifying an `env` parameter.
$$
$$section
### Pagination Limit $(id="paginationLimit")
The number of data sources requested per page when querying the Tableau GraphQL endpoint (Metadata API) for data source information. The default is 10, and values above 50 are capped at 50.
$$
$$section
### Verify SSL $(id="verifySSL")