mirror of
https://github.com/datahub-project/datahub.git
synced 2025-09-03 22:33:25 +00:00
feat(tableau): use pagination for all connection queries (#5204)
This commit is contained in:
parent
baf3f3f33c
commit
022ef2f17c
@ -44,7 +44,7 @@ Workbooks from Tableau are ingested as Container in datahub. <br/>
|
|||||||
- GraphQL query <br/>
|
- GraphQL query <br/>
|
||||||
```graphql
|
```graphql
|
||||||
{
|
{
|
||||||
workbooksConnection(first: 15, offset: 0, filter: {projectNameWithin: ["default", "Project 2"]}) {
|
workbooksConnection(first: 10, offset: 0, filter: {projectNameWithin: ["default", "Project 2"]}) {
|
||||||
nodes {
|
nodes {
|
||||||
id
|
id
|
||||||
name
|
name
|
||||||
@ -73,7 +73,7 @@ Dashboards from Tableau are ingested as Dashboard in datahub. <br/>
|
|||||||
- GraphQL query <br/>
|
- GraphQL query <br/>
|
||||||
```graphql
|
```graphql
|
||||||
{
|
{
|
||||||
workbooksConnection(first: 15, offset: 0, filter: {projectNameWithin: ["default", "Project 2"]}) {
|
workbooksConnection(first: 10, offset: 0, filter: {projectNameWithin: ["default", "Project 2"]}) {
|
||||||
nodes {
|
nodes {
|
||||||
.....
|
.....
|
||||||
dashboards {
|
dashboards {
|
||||||
@ -185,7 +185,7 @@ Embedded Data source from Tableau is ingested as a Dataset in datahub.
|
|||||||
- GraphQL query <br/>
|
- GraphQL query <br/>
|
||||||
```graphql
|
```graphql
|
||||||
{
|
{
|
||||||
workbooksConnection(first: 15, offset: 0, filter: {projectNameWithin: ["default"]}) {
|
workbooksConnection(first: 10, offset: 0, filter: {projectNameWithin: ["default"]}) {
|
||||||
nodes {
|
nodes {
|
||||||
....
|
....
|
||||||
embeddedDatasources {
|
embeddedDatasources {
|
||||||
@ -265,7 +265,7 @@ Published Data source from Tableau is ingested as a Dataset in datahub.
|
|||||||
- GraphQL query <br/>
|
- GraphQL query <br/>
|
||||||
```graphql
|
```graphql
|
||||||
{
|
{
|
||||||
publishedDatasourcesConnection(filter: {idWithin: ["00cce29f-b561-bb41-3557-8e19660bb5dd", "618c87db-5959-338b-bcc7-6f5f4cc0b6c6"]}) {
|
publishedDatasourcesConnection(first: 10, offset: 0, filter: {idWithin: ["00cce29f-b561-bb41-3557-8e19660bb5dd", "618c87db-5959-338b-bcc7-6f5f4cc0b6c6"]}) {
|
||||||
nodes {
|
nodes {
|
||||||
__typename
|
__typename
|
||||||
id
|
id
|
||||||
@ -343,7 +343,7 @@ For custom sql data sources, the query is viewable in UI under View Definition t
|
|||||||
- GraphQL query <br/>
|
- GraphQL query <br/>
|
||||||
```graphql
|
```graphql
|
||||||
{
|
{
|
||||||
customSQLTablesConnection(filter: {idWithin: ["22b0b4c3-6b85-713d-a161-5a87fdd78f40"]}) {
|
customSQLTablesConnection(first: 10, offset: 0, filter: {idWithin: ["22b0b4c3-6b85-713d-a161-5a87fdd78f40"]}) {
|
||||||
nodes {
|
nodes {
|
||||||
id
|
id
|
||||||
name
|
name
|
||||||
@ -408,8 +408,8 @@ Lineage is emitted as received from Tableau's metadata API for
|
|||||||
|
|
||||||
## Troubleshooting
|
## Troubleshooting
|
||||||
|
|
||||||
### Why are only some workbooks ingested from the specified project?
|
### Why are only some workbooks/custom SQLs/published datasources ingested from the specified project?
|
||||||
|
|
||||||
This may happen when the Tableau API returns NODE_LIMIT_EXCEEDED error in response to metadata query and returns partial results with message "Showing partial results. , The request exceeded the ‘n’ node limit. Use pagination, additional filtering, or both in the query to adjust results." To resolve this, consider
|
This may happen when the Tableau API returns a NODE_LIMIT_EXCEEDED error in response to a metadata query and returns only partial results with the message "Showing partial results. , The request exceeded the ‘n’ node limit. Use pagination, additional filtering, or both in the query to adjust results." To resolve this, consider
|
||||||
- reducing the page size using the `workbooks_page_size` config param in datahub recipe (Defaults to 10).
|
- reducing the page size using the `page_size` config parameter in the DataHub recipe (defaults to 10).
|
||||||
- increasing tableau configuration [metadata query node limit](https://help.tableau.com/current/server/en-us/cli_configuration-set_tsm.htm#metadata_nodelimit) to higher value.
|
- increasing the Tableau configuration [metadata query node limit](https://help.tableau.com/current/server/en-us/cli_configuration-set_tsm.htm#metadata_nodelimit) to a higher value.
|
@ -5,7 +5,7 @@ from functools import lru_cache
|
|||||||
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
|
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
|
||||||
|
|
||||||
import dateutil.parser as dp
|
import dateutil.parser as dp
|
||||||
from pydantic import validator
|
from pydantic import root_validator, validator
|
||||||
from pydantic.fields import Field
|
from pydantic.fields import Field
|
||||||
from tableauserverclient import (
|
from tableauserverclient import (
|
||||||
PersonalAccessTokenAuth,
|
PersonalAccessTokenAuth,
|
||||||
@ -132,10 +132,16 @@ class TableauConfig(ConfigModel):
|
|||||||
description="Ingest details for tables external to (not embedded in) tableau as entities.",
|
description="Ingest details for tables external to (not embedded in) tableau as entities.",
|
||||||
)
|
)
|
||||||
|
|
||||||
workbooks_page_size: int = Field(
|
workbooks_page_size: Optional[int] = Field(
|
||||||
default=10,
|
default=None,
|
||||||
description="Number of workbooks to query at a time using Tableau api.",
|
description="@deprecated(use page_size instead) Number of workbooks to query at a time using Tableau api.",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
page_size: int = Field(
|
||||||
|
default=10,
|
||||||
|
description="Number of metadata objects (e.g. CustomSQLTable, PublishedDatasource, etc) to query at a time using Tableau api.",
|
||||||
|
)
|
||||||
|
|
||||||
env: str = Field(
|
env: str = Field(
|
||||||
default=builder.DEFAULT_ENV,
|
default=builder.DEFAULT_ENV,
|
||||||
description="Environment to use in namespace when constructing URNs.",
|
description="Environment to use in namespace when constructing URNs.",
|
||||||
@ -145,6 +151,17 @@ class TableauConfig(ConfigModel):
|
|||||||
def remove_trailing_slash(cls, v):
|
def remove_trailing_slash(cls, v):
|
||||||
return config_clean.remove_trailing_slashes(v)
|
return config_clean.remove_trailing_slashes(v)
|
||||||
|
|
||||||
|
@root_validator()
|
||||||
|
def show_warning_for_deprecated_config_field(
|
||||||
|
cls, values: Dict[str, Any]
|
||||||
|
) -> Dict[str, Any]:
|
||||||
|
if values.get("workbooks_page_size") is not None:
|
||||||
|
logger.warn(
|
||||||
|
"Config workbooks_page_size is deprecated. Please use config page_size instead."
|
||||||
|
)
|
||||||
|
|
||||||
|
return values
|
||||||
|
|
||||||
|
|
||||||
class WorkbookKey(PlatformKey):
|
class WorkbookKey(PlatformKey):
|
||||||
workbook_id: str
|
workbook_id: str
|
||||||
@ -247,6 +264,9 @@ class TableauSource(Source):
|
|||||||
count: int = 0,
|
count: int = 0,
|
||||||
current_count: int = 0,
|
current_count: int = 0,
|
||||||
) -> Tuple[dict, int, int]:
|
) -> Tuple[dict, int, int]:
|
||||||
|
logger.debug(
|
||||||
|
f"Query {connection_type} to get {count} objects with offset {current_count}"
|
||||||
|
)
|
||||||
query_data = query_metadata(
|
query_data = query_metadata(
|
||||||
self.server, query, connection_type, count, current_count, query_filter
|
self.server, query, connection_type, count, current_count, query_filter
|
||||||
)
|
)
|
||||||
@ -267,7 +287,12 @@ class TableauSource(Source):
|
|||||||
has_next_page = connection_object.get("pageInfo", {}).get("hasNextPage", False)
|
has_next_page = connection_object.get("pageInfo", {}).get("hasNextPage", False)
|
||||||
return connection_object, total_count, has_next_page
|
return connection_object, total_count, has_next_page
|
||||||
|
|
||||||
def emit_workbooks(self, workbooks_page_size: int) -> Iterable[MetadataWorkUnit]:
|
def emit_workbooks(self) -> Iterable[MetadataWorkUnit]:
|
||||||
|
count_on_query = (
|
||||||
|
self.config.page_size
|
||||||
|
if self.config.workbooks_page_size is None
|
||||||
|
else self.config.workbooks_page_size
|
||||||
|
)
|
||||||
|
|
||||||
projects = (
|
projects = (
|
||||||
f"projectNameWithin: {json.dumps(self.config.projects)}"
|
f"projectNameWithin: {json.dumps(self.config.projects)}"
|
||||||
@ -282,8 +307,8 @@ class TableauSource(Source):
|
|||||||
current_count = 0
|
current_count = 0
|
||||||
while has_next_page:
|
while has_next_page:
|
||||||
count = (
|
count = (
|
||||||
workbooks_page_size
|
count_on_query
|
||||||
if current_count + workbooks_page_size < total_count
|
if current_count + count_on_query < total_count
|
||||||
else total_count - current_count
|
else total_count - current_count
|
||||||
)
|
)
|
||||||
(
|
(
|
||||||
@ -410,7 +435,7 @@ class TableauSource(Source):
|
|||||||
return upstream_tables
|
return upstream_tables
|
||||||
|
|
||||||
def emit_custom_sql_datasources(self) -> Iterable[MetadataWorkUnit]:
|
def emit_custom_sql_datasources(self) -> Iterable[MetadataWorkUnit]:
|
||||||
count_on_query = len(self.custom_sql_ids_being_used)
|
count_on_query = self.config.page_size
|
||||||
custom_sql_filter = "idWithin: {}".format(
|
custom_sql_filter = "idWithin: {}".format(
|
||||||
json.dumps(self.custom_sql_ids_being_used)
|
json.dumps(self.custom_sql_ids_being_used)
|
||||||
)
|
)
|
||||||
@ -779,7 +804,7 @@ class TableauSource(Source):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def emit_published_datasources(self) -> Iterable[MetadataWorkUnit]:
|
def emit_published_datasources(self) -> Iterable[MetadataWorkUnit]:
|
||||||
count_on_query = len(self.datasource_ids_being_used)
|
count_on_query = self.config.page_size
|
||||||
datasource_filter = "idWithin: {}".format(
|
datasource_filter = "idWithin: {}".format(
|
||||||
json.dumps(self.datasource_ids_being_used)
|
json.dumps(self.datasource_ids_being_used)
|
||||||
)
|
)
|
||||||
@ -1148,7 +1173,7 @@ class TableauSource(Source):
|
|||||||
if self.server is None or not self.server.is_signed_in():
|
if self.server is None or not self.server.is_signed_in():
|
||||||
return
|
return
|
||||||
try:
|
try:
|
||||||
yield from self.emit_workbooks(self.config.workbooks_page_size)
|
yield from self.emit_workbooks()
|
||||||
if self.datasource_ids_being_used:
|
if self.datasource_ids_being_used:
|
||||||
yield from self.emit_published_datasources()
|
yield from self.emit_published_datasources()
|
||||||
if self.custom_sql_ids_being_used:
|
if self.custom_sql_ids_being_used:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user