Fix#10584: Add filters for data models (#11008)

* Fix#10584: Add filters for data models

* Fix minor UI error

* Fix unit test

---------

Co-authored-by: Ashish Gupta <ashish@getcollate.io>
Nahuel 2023-04-12 14:06:01 +02:00 committed by GitHub
parent dbd8da6a29
commit 673573a512
14 changed files with 215 additions and 85 deletions

View File

@ -78,7 +78,7 @@ from metadata.ingestion.source.dashboard.looker.models import (
from metadata.ingestion.source.dashboard.looker.parser import LkmlParser
from metadata.readers.github import GitHubReader
from metadata.utils import fqn
from metadata.utils.filters import filter_by_chart
from metadata.utils.filters import filter_by_chart, filter_by_datamodel
from metadata.utils.helpers import clean_uri, get_standard_chart_type
from metadata.utils.logger import ingestion_logger
@ -183,48 +183,55 @@ class LookerSource(DashboardServiceSource):
Get the Explore and View information and prepare
the model creation request
"""
try:
explore_datamodel = CreateDashboardDataModelRequest(
name=build_datamodel_name(model.model_name, model.name),
displayName=model.name,
description=model.description,
service=self.context.dashboard_service.fullyQualifiedName.__root__,
dataModelType=DataModelType.LookMlExplore.value,
serviceType=DashboardServiceType.Looker.value,
columns=get_columns_from_model(model),
sql=self._get_explore_sql(model),
)
yield explore_datamodel
self.status.scanned(f"Data Model Scanned: {model.name}")
# Maybe use the project_name as key too?
# Save the explores for when we create the lineage with the dashboards and views
self._explores_cache[
explore_datamodel.name.__root__
] = self.context.dataModel # This is the newly created explore
# We can get VIEWs from the JOINs to know the dependencies
# We will only try and fetch if we have the credentials
if self.service_connection.githubCredentials:
for view in model.joins:
yield from self._process_view(
view_name=ViewName(view.name), explore=model
if self.source_config.includeDataModels:
try:
datamodel_name = build_datamodel_name(model.model_name, model.name)
if filter_by_datamodel(
self.source_config.dataModelFilterPattern, datamodel_name
):
self.status.filter(datamodel_name, "Data model filtered out.")
else:
explore_datamodel = CreateDashboardDataModelRequest(
name=datamodel_name,
displayName=model.name,
description=model.description,
service=self.context.dashboard_service.fullyQualifiedName.__root__,
dataModelType=DataModelType.LookMlExplore.value,
serviceType=DashboardServiceType.Looker.value,
columns=get_columns_from_model(model),
sql=self._get_explore_sql(model),
)
yield explore_datamodel
self.status.scanned(f"Data Model Scanned: {model.name}")
except ValidationError as err:
error = f"Validation error yielding Data Model [{model.name}]: {err}"
logger.debug(traceback.format_exc())
logger.error(error)
self.status.failed(
name=model.name, error=error, stack_trace=traceback.format_exc()
)
except Exception as err:
error = f"Wild error yielding Data Model [{model.name}]: {err}"
logger.debug(traceback.format_exc())
logger.error(error)
self.status.failed(
name=model.name, error=error, stack_trace=traceback.format_exc()
)
# Maybe use the project_name as key too?
# Save the explores for when we create the lineage with the dashboards and views
self._explores_cache[
explore_datamodel.name.__root__
] = self.context.dataModel # This is the newly created explore
# We can get VIEWs from the JOINs to know the dependencies
# We will only try and fetch if we have the credentials
if self.service_connection.githubCredentials:
for view in model.joins:
yield from self._process_view(
view_name=ViewName(view.name), explore=model
)
except ValidationError as err:
error = f"Validation error yielding Data Model [{model.name}]: {err}"
logger.debug(traceback.format_exc())
logger.error(error)
self.status.failed(
name=model.name, error=error, stack_trace=traceback.format_exc()
)
except Exception as err:
error = f"Wild error yielding Data Model [{model.name}]: {err}"
logger.debug(traceback.format_exc())
logger.error(error)
self.status.failed(
name=model.name, error=error, stack_trace=traceback.format_exc()
)
def _get_explore_sql(self, explore: LookmlModelExplore) -> Optional[str]:
"""

View File

@ -59,7 +59,7 @@ from metadata.ingestion.source.dashboard.tableau.models import (
)
from metadata.ingestion.source.database.column_type_parser import ColumnTypeParser
from metadata.utils import fqn, tag_utils
from metadata.utils.filters import filter_by_chart
from metadata.utils.filters import filter_by_chart, filter_by_datamodel
from metadata.utils.helpers import get_standard_chart_type
from metadata.utils.logger import ingestion_logger
@ -198,45 +198,56 @@ class TableauSource(DashboardServiceSource):
def yield_datamodel(
self, dashboard_details: TableauDashboard
) -> Iterable[CreateDashboardDataModelRequest]:
data_models: TableauSheets = TableauSheets()
for chart in dashboard_details.charts:
try:
data_models = self.client.get_sheets(chart.id)
except Exception as exc:
error_msg = f"Error fetching Data Model for sheet {chart.name} - {exc}"
self.status.failed(
name=chart.name, error=error_msg, stack_trace=traceback.format_exc()
)
logger.error(error_msg)
logger.debug(traceback.format_exc())
for data_model in data_models.sheets:
if self.source_config.includeDataModels:
data_models: TableauSheets = TableauSheets()
for chart in dashboard_details.charts:
try:
data_model_request = CreateDashboardDataModelRequest(
name=data_model.id,
displayName=data_model.name,
description=data_model.description,
service=self.context.dashboard_service.fullyQualifiedName.__root__,
dataModelType=DataModelType.TableauSheet.value,
serviceType=DashboardServiceType.Tableau.value,
columns=self.get_column_info(data_model),
)
yield data_model_request
self.sheets.add(data_model)
self.status.scanned(
f"Data Model Scanned: {data_model_request.name.__root__}"
)
data_models = self.client.get_sheets(chart.id)
except Exception as exc:
error_msg = f"Error yeilding Data Model - {data_model.name} - {exc}"
error_msg = (
f"Error fetching Data Model for sheet {chart.name} - {exc}"
)
self.status.failed(
name=data_model.name,
name=chart.name,
error=error_msg,
stack_trace=traceback.format_exc(),
)
logger.error(error_msg)
logger.debug(traceback.format_exc())
for data_model in data_models.sheets:
if filter_by_datamodel(
self.source_config.dataModelFilterPattern, data_model.name
):
self.status.filter(data_model.name, "Data model filtered out.")
continue
try:
data_model_request = CreateDashboardDataModelRequest(
name=data_model.id,
displayName=data_model.name,
description=data_model.description,
service=self.context.dashboard_service.fullyQualifiedName.__root__,
dataModelType=DataModelType.TableauSheet.value,
serviceType=DashboardServiceType.Tableau.value,
columns=self.get_column_info(data_model),
)
yield data_model_request
self.sheets.add(data_model)
self.status.scanned(
f"Data Model Scanned: {data_model_request.displayName}"
)
except Exception as exc:
error_msg = (
f"Error yielding Data Model [{data_model.name}]: {exc}"
)
self.status.failed(
name=data_model.name,
error=error_msg,
stack_trace=traceback.format_exc(),
)
logger.error(error_msg)
logger.debug(traceback.format_exc())
def yield_dashboard(
self, dashboard_details: TableauDashboard
) -> Iterable[CreateDashboardRequest]:
@ -389,7 +400,7 @@ class TableauSource(DashboardServiceSource):
),
service=self.context.dashboard_service.fullyQualifiedName.__root__,
)
self.status.scanned(chart.id)
self.status.scanned(chart.name)
except Exception as exc:
logger.debug(traceback.format_exc())
logger.warning(f"Error to yield dashboard chart [{chart}]: {exc}")
@ -400,15 +411,14 @@ class TableauSource(DashboardServiceSource):
except ConnectionError as err:
logger.debug(f"Error closing connection - {err}")
def _get_database_table(self, db_service_name, table) -> Table:
database_schema_table = fqn.split_table_name(table.name)
def _get_database_table(self, db_service_name: str, table: DatabaseTable) -> Table:
table_fqn = fqn.build(
self.metadata,
entity_type=Table,
service_name=db_service_name,
schema_name=table.schema_,
table_name=database_schema_table.get("table"),
database_name=database_schema_table.get("database"),
table_name=table.name,
database_name=table.database.name,
)
return self.metadata.get_by_name(
entity=Table,
@ -456,5 +466,6 @@ class TableauSource(DashboardServiceSource):
for colum in sheet.datasourceFields:
for table in colum.upstreamTables:
if table.schema_ and table.name:
table.name = table.name.split(" ")[0].strip()
tables.add(table)
return tables
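Two smaller Tableau changes sit in the later hunks: _get_database_table now takes a typed DatabaseTable and builds the table FQN from table.name and table.database.name instead of re-splitting a combined string, and the final hunk trims the raw upstream table name before adding it to the deduplication set. A tiny, hypothetical illustration of that trim (the raw value is invented, not taken from a real Tableau payload):

raw_name = "ORDERS (Custom SQL Query)"       # invented example of a raw upstream name
clean_name = raw_name.split(" ")[0].strip()  # same expression as in the final hunk above
assert clean_name == "ORDERS"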

View File

@ -122,7 +122,7 @@ class DatabaseTable(TableauBaseModel):
"""
schema_: str = Field(..., alias="schema")
upstreamDatabases: Optional[List[TableauBaseModel]]
database: TableauBaseModel
referencedByQueries: Optional[List[CustomSQLTable]]
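DatabaseTable gains a required database field, matching the database {{ id name }} selection added to the GraphQL query in the next file. As a rough sketch only — assuming TableauBaseModel carries the id/name pair that selection returns and the model has no other required fields — an upstream table from the response would deserialize like this:

from metadata.ingestion.source.dashboard.tableau.models import DatabaseTable

# Invented values; "schema" populates the schema_ field through its alias.
upstream_table = DatabaseTable.parse_obj(
    {
        "id": "tbl-1",
        "name": "orders",
        "schema": "public",
        "database": {"id": "db-1", "name": "analytics"},
    }
)
# upstream_table.database.name ("analytics") is what _get_database_table now
# passes as database_name when building the table FQN.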

View File

@ -27,6 +27,20 @@ query SheetQuery {{
__typename
name
id
description
datasource {{
id
name
}}
... on ColumnField {{
dataType
}}
... on CalculatedField {{
dataType
}}
... on GroupField {{
dataType
}}
... on DatasourceField {{
upstreamTables {{
upstreamDatabases {{
@ -41,9 +55,15 @@ query SheetQuery {{
id
name
schema
database {{
id
name
}}
}}
remoteField {{
id
name
description
__typename
... on ColumnField {{
dataType

View File

@ -223,3 +223,18 @@ def filter_by_container(
:return: True for filtering, False otherwise
"""
return _filter(container_filter_pattern, container_name)
def filter_by_datamodel(
datamodel_filter_pattern: Optional[FilterPattern], datamodel_name: str
) -> bool:
"""
Return True if the data model needs to be filtered, False otherwise
Include takes precedence over exclude
:param datamodel_filter_pattern: Model defining data model filtering logic
:param datamodel_name: data model name
:return: True for filtering, False otherwise
"""
return _filter(datamodel_filter_pattern, datamodel_name)
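A minimal usage sketch of the new helper, assuming the generated FilterPattern model with its includes/excludes regex lists; as with the other filter_by_* helpers, True means the entity is dropped:

from metadata.generated.schema.type.filterPattern import FilterPattern
from metadata.utils.filters import filter_by_datamodel

pattern = FilterPattern(includes=["finance_.*"], excludes=[".*_tmp"])

filter_by_datamodel(pattern, "finance_orders")  # False -> data model is kept
filter_by_datamodel(pattern, "marketing_kpis")  # True  -> filtered out (not in includes)
filter_by_datamodel(None, "anything")           # False -> no pattern configured, keep everything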

View File

@ -18,11 +18,15 @@
"default": "DashboardMetadata"
},
"dashboardFilterPattern": {
"description": "Regex to only fetch tables or databases that matches the pattern.",
"description": "Regex to exclude or include dashboards that matches the pattern.",
"$ref": "../type/filterPattern.json#/definitions/filterPattern"
},
"chartFilterPattern": {
"description": "Regex exclude tables or databases that matches the pattern.",
"description": "Regex exclude or include charts that matches the pattern.",
"$ref": "../type/filterPattern.json#/definitions/filterPattern"
},
"dataModelFilterPattern": {
"description": "Regex exclude or include data models that matches the pattern.",
"$ref": "../type/filterPattern.json#/definitions/filterPattern"
},
"dbServiceNames": {
@ -45,6 +49,11 @@
"description": "Optional configuration to toggle the tags ingestion.",
"type": "boolean",
"default": true
},
"includeDataModels": {
"description": "Optional configuration to toggle the ingestion of data models.",
"type": "boolean",
"default": true
}
},
"additionalProperties": false

View File

@ -160,6 +160,7 @@ const AddIngestion = ({
showSchemaFilter: !isUndefined(sourceConfig?.schemaFilterPattern),
showTableFilter: !isUndefined(sourceConfig?.tableFilterPattern),
showTopicFilter: !isUndefined(sourceConfig?.topicFilterPattern),
showDataModelFilter: !isUndefined(sourceConfig?.dataModelFilterPattern),
showChartFilter: !isUndefined(sourceConfig?.chartFilterPattern),
showPipelineFilter: !isUndefined(sourceConfig?.pipelineFilterPattern),
showMlModelFilter: !isUndefined(sourceConfig?.mlModelFilterPattern),
@ -172,6 +173,8 @@ const AddIngestion = ({
markDeletedTables: isDatabaseService
? Boolean(sourceConfig?.markDeletedTables ?? true)
: undefined,
dataModelFilterPattern:
sourceConfig?.dataModelFilterPattern ?? INITIAL_FILTER_PATTERN,
dashboardFilterPattern:
sourceConfig?.dashboardFilterPattern ?? INITIAL_FILTER_PATTERN,
containerFilterPattern:
@ -191,6 +194,7 @@ const AddIngestion = ({
markDeletedPipelines: sourceConfig?.markDeletedDashboards ?? true,
includeView: Boolean(sourceConfig?.includeViews),
includeTags: sourceConfig?.includeTags ?? true,
includeDataModels: sourceConfig?.includeDataModels ?? true,
overrideOwner: Boolean(sourceConfig?.overrideOwner),
includeLineage: Boolean(sourceConfig?.includeLineage ?? true),
enableDebugLog: data?.loggerLevel === LogLevels.Debug,
@ -328,12 +332,14 @@ const AddIngestion = ({
const getMetadataIngestionFields = () => {
const {
chartFilterPattern,
dataModelFilterPattern,
dashboardFilterPattern,
databaseFilterPattern,
databaseServiceNames,
includeLineage,
includeTags,
includeView,
includeDataModels,
showContainerFilter,
ingestSampleData,
markAllDeletedTables,
@ -348,6 +354,7 @@ const AddIngestion = ({
schemaFilterPattern,
showChartFilter,
showDashboardFilter,
showDataModelFilter,
showDatabaseFilter,
showMlModelFilter,
showPipelineFilter,
@ -404,11 +411,16 @@ const AddIngestion = ({
dashboardFilterPattern,
showDashboardFilter
),
dataModelFilterPattern: getFilterPatternData(
dataModelFilterPattern,
showDataModelFilter
),
dbServiceNames: databaseServiceNames,
overrideOwner,
type: ConfigType.DashboardMetadata,
markDeletedDashboards,
includeTags,
includeDataModels,
};
}
case ServiceCategory.PIPELINE_SERVICES: {

View File

@ -129,7 +129,7 @@ describe('Test ConfigureIngestion component', () => {
container,
'FilterPattern.component'
);
const toggleSwitchs = await findAllByText(
const toggleSwitch = await findAllByText(
container,
'ToggleSwitchV1.component'
);
@ -138,6 +138,6 @@ describe('Test ConfigureIngestion component', () => {
expect(backButton).toBeInTheDocument();
expect(nextButton).toBeInTheDocument();
expect(filterPatternComponents).toHaveLength(3);
expect(toggleSwitchs).toHaveLength(5);
expect(toggleSwitch).toHaveLength(6);
});
});

View File

@ -50,6 +50,7 @@ const ConfigureIngestion = ({
const markdownRef = useRef<EditorContentRef>();
const {
dataModelFilterPattern,
chartFilterPattern,
dashboardFilterPattern,
databaseFilterPattern,
@ -61,6 +62,7 @@ const ConfigureIngestion = ({
includeLineage,
includeTags,
includeView,
includeDataModels,
ingestionName,
ingestSampleData,
markAllDeletedTables,
@ -76,6 +78,7 @@ const ConfigureIngestion = ({
queryLogDuration,
resultLimit,
schemaFilterPattern,
showDataModelFilter,
showChartFilter,
showDashboardFilter,
showDatabaseFilter,
@ -95,6 +98,7 @@ const ConfigureIngestion = ({
overrideOwner,
} = useMemo(
() => ({
dataModelFilterPattern: data.dataModelFilterPattern,
chartFilterPattern: data.chartFilterPattern,
dashboardFilterPattern: data.dashboardFilterPattern,
databaseFilterPattern: data.databaseFilterPattern,
@ -106,6 +110,7 @@ const ConfigureIngestion = ({
includeLineage: data.includeLineage,
includeTags: data.includeTags,
includeView: data.includeView,
includeDataModels: data.includeDataModels,
ingestionName: data.ingestionName,
ingestSampleData: data.ingestSampleData,
markAllDeletedTables: data.markAllDeletedTables,
@ -117,6 +122,7 @@ const ConfigureIngestion = ({
queryLogDuration: data.queryLogDuration,
resultLimit: data.resultLimit,
schemaFilterPattern: data.schemaFilterPattern,
showDataModelFilter: data.showDataModelFilter,
showChartFilter: data.showChartFilter,
showDashboardFilter: data.showDashboardFilter,
showDatabaseFilter: data.showDatabaseFilter,
@ -193,6 +199,8 @@ const ConfigureIngestion = ({
const handleIncludeTags = () => toggleField('includeTags');
const handleIncludeDataModels = () => toggleField('includeDataModels');
const handleIncludeViewToggle = () => toggleField('includeView');
const handleIngestSampleToggle = () => toggleField('ingestSampleData');
@ -297,7 +305,34 @@ const ConfigureIngestion = ({
/>
</div>
<p className="tw-text-grey-muted tw-mt-3">
{t('message.include-assets-message')}
{t('message.include-assets-message', {
assets: t('label.tag-plural'),
})}
</p>
{getSeparator('')}
</Field>
);
};
const getIncludesDataModelsToggle = () => {
return (
<Field>
<div className="tw-flex tw-gap-1">
<label>
{t('label.include-entity', {
entity: t('label.data-model-plural'),
})}
</label>
<ToggleSwitchV1
checked={includeDataModels}
handleCheck={handleIncludeDataModels}
testId="include-data-models"
/>
</div>
<p className="tw-text-grey-muted tw-mt-3">
{t('message.include-assets-message', {
assets: t('label.data-model-plural'),
})}
</p>
{getSeparator('')}
</Field>
@ -453,6 +488,7 @@ const ConfigureIngestion = ({
{getSeparator('')}
</Field>
{getIncludesTagToggle()}
{getIncludesDataModelsToggle()}
{getDebugLogToggle()}
{getMarkDeletedEntitiesToggle(
t('label.mark-deleted-table-plural'),
@ -668,11 +704,24 @@ const ConfigureIngestion = ({
showSeparator={false}
type={FilterPatternEnum.CHART}
/>
<FilterPattern
checked={showDataModelFilter}
excludePattern={dataModelFilterPattern.excludes ?? []}
getExcludeValue={getExcludeValue}
getIncludeValue={getIncludeValue}
handleChecked={(value) =>
handleShowFilter(value, ShowFilter.showDataModelFilter)
}
includePattern={dataModelFilterPattern.includes ?? []}
showSeparator={false}
type={FilterPatternEnum.DASHBOARD_DATAMODEL}
/>
{getSeparator('')}
{getDashboardDBServiceName()}
{getDebugLogToggle()}
{getOverrideOwnerToggle()}
{getIncludesTagToggle()}
{getIncludesDataModelsToggle()}
{getMarkDeletedEntitiesToggle(
t('label.mark-deleted-entity', {
entity: t('label.dashboard-plural'),

View File

@ -93,6 +93,7 @@ export type ModifiedDbtConfig = DbtConfig &
>;
export interface AddIngestionState {
dataModelFilterPattern: FilterPattern;
chartFilterPattern: FilterPattern;
database?: string;
dashboardFilterPattern: FilterPattern;
@ -110,6 +111,7 @@ export interface AddIngestionState {
includeLineage: boolean;
includeTags: boolean;
includeView: boolean;
includeDataModels: boolean;
ingestionName: string;
ingestSampleData: boolean;
markAllDeletedTables: boolean | undefined;
@ -128,6 +130,7 @@ export interface AddIngestionState {
resultLimit: number;
saveState: LoadingState;
schemaFilterPattern: FilterPattern;
showDataModelFilter: boolean;
showChartFilter: boolean;
showDashboardFilter: boolean;
showDatabaseFilter: boolean;
@ -159,4 +162,5 @@ export enum ShowFilter {
showTableFilter = 'showTableFilter',
showTopicFilter = 'showTopicFilter',
showContainerFilter = 'showContainerFilter',
showDataModelFilter = 'showDataModelFilter',
}

View File

@ -22,4 +22,5 @@ export enum FilterPatternEnum {
PIPELINE = 'pipeline',
MLMODEL = 'mlModel',
CONTAINER = 'container',
DASHBOARD_DATAMODEL = 'dataModel',
}

View File

@ -1037,7 +1037,7 @@
"has-been-created-successfully": "has been created successfully",
"import-glossary-help": "Save time & effort by uploading a CSV file with several glossary terms in one go.",
"in-this-database": "In this Database",
"include-assets-message": "Optional configuration to toggle the tags ingestion.",
"include-assets-message": "Enable extracting {{assets}} from the data source.",
"include-database-filter-extra-information": "Database which was added while creating service.",
"include-lineage-message": "Configuration to turn off fetching lineage from pipelines.",
"ingest-sample-data-for-entity": "Extract sample data from each {{entity}}.",

View File

@ -1037,7 +1037,7 @@
"has-been-created-successfully": "se ha creado exitosamente",
"import-glossary-help": "Ahorre tiempo y esfuerzo cargando un archivo CSV con varios términos de glosario de una sola vez.",
"in-this-database": "En esta base de datos",
"include-assets-message": "Configuración opcional para cambiar la ingestión de etiquetas.",
"include-assets-message": "Configuración opcional para activar la ingesta de {{assets}}.",
"include-database-filter-extra-information": "Base de datos que se agregó al crear el servicio.",
"include-lineage-message": "Configuración para desactivar la obtención de linaje desde pipelines.",
"ingest-sample-data-for-entity": "Extraer datos de muestra de cada {{entity}}.",

View File

@ -886,6 +886,8 @@ export const getFilterTypes = (
return 'tableFilterPattern' as keyof AddIngestionState;
case FilterPatternEnum.CONTAINER:
return 'containerFilterPattern' as keyof AddIngestionState;
case FilterPatternEnum.DASHBOARD_DATAMODEL:
return 'dataModelFilterPattern' as keyof AddIngestionState;
default:
return 'topicFilterPattern' as keyof AddIngestionState;
}